Diffstat (limited to 'unsupported')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h     |  38
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h   |   2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h |   7
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h    | 158
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h  |  22
-rw-r--r--  unsupported/test/cxx11_tensor_executor.cpp            |  43
6 files changed, 244 insertions(+), 26 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 576a4f3ec..60a07d6eb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -76,6 +76,8 @@ class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType, MakePointer_>,
   typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index;

+  static const int NumDims = Eigen::internal::traits<TensorEvalToOp>::NumDimensions;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr)
       : m_xpr(expr), m_buffer(buffer) {}

@@ -106,15 +108,22 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
   typedef StorageMemory<CoeffReturnType, Device> Storage;
   typedef typename Storage::Type EvaluatorPointerType;
   enum {
-    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
+    IsAligned         = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = true,
     PreferBlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = true
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = true
   };

+  typedef typename internal::TensorBlock<
+      CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
+      TensorBlock;
+
+  typedef typename internal::TensorBlockReader<
+      CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
+      TensorBlockReader;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){}

@@ -138,6 +147,18 @@
     internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    m_impl.getResourceRequirements(resources);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) {
+    TensorBlock eval_to_block(block->first_coeff_index(), block->block_sizes(),
+                              block->tensor_strides(), block->tensor_strides(),
+                              m_buffer + block->first_coeff_index());
+    m_impl.block(&eval_to_block);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }

@@ -153,6 +174,11 @@
     return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
+    assert(m_buffer != NULL);
+    TensorBlockReader::Run(block, m_buffer);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     // We assume that evalPacket or evalScalar is called to perform the
    // assignment and account for the cost of the write here.
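
The interesting part of the TensorEvalTo change is evalBlock(): rather than materializing a block into scratch memory and copying it out, the evaluator builds a destination block that aliases m_buffer at the block's first coefficient and carries the full tensor's strides, so the nested expression writes every coefficient straight to its final address. Below is a minimal standalone sketch of that idea; the Block struct and eval_into() producer are illustrative stand-ins, not Eigen's internals.

    // Illustrative sketch of writing a block in place; not Eigen's real API.
    #include <cstddef>
    #include <iostream>
    #include <vector>

    // The destination block: extents per dimension, strides of the *full*
    // output tensor, and a data pointer already offset to the block's origin.
    struct Block {
      std::vector<std::ptrdiff_t> sizes;
      std::vector<std::ptrdiff_t> strides;
      float* data;
    };

    // Stand-in for m_impl.block(...): fills a 2-d block through the strides,
    // so every coefficient lands at its final address in the output tensor.
    void eval_into(const Block& b) {
      for (std::ptrdiff_t j = 0; j < b.sizes[1]; ++j)
        for (std::ptrdiff_t i = 0; i < b.sizes[0]; ++i)
          b.data[i * b.strides[0] + j * b.strides[1]] = 1.0f;
    }

    int main() {
      // 4x4 column-major output; evaluate the 2x2 block whose origin is (2, 1).
      std::vector<float> buffer(16, 0.0f);
      const std::ptrdiff_t first_coeff_index = 2 + 1 * 4;  // linear index of (2, 1)
      Block block{{2, 2}, {1, 4}, buffer.data() + first_coeff_index};
      eval_into(block);  // no scratch buffer, no second copy
      for (int i = 0; i < 16; ++i)
        std::cout << buffer[i] << (i % 4 == 3 ? '\n' : ' ');
      return 0;
    }

Here the 2x2 block lands at linear indices 6, 7, 10, and 11 of the 4x4 buffer with no intermediate copy, which is exactly what passing m_buffer + first_coeff_index with the tensor strides accomplishes above.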
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 47e9b24ec..f1ae548f7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -346,7 +346,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
       // expressions.
       const int thread_idx = device.currentThreadId();
       eigen_assert(thread_idx >= -1 && thread_idx < num_threads);
-      Scalar* thread_buf = reinterpret_cast<Scalar*>(
+      ScalarNoConst* thread_buf = reinterpret_cast<ScalarNoConst*>(
          static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1));
       for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
         auto block = block_mapper.GetBlockForIndex(i, thread_buf);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index e7b7c1e6b..27a15bfb2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -133,7 +133,12 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
     typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
     EvalTo evalToTmp(m_device.get(m_buffer), m_op);
     const bool Vectorize = internal::IsVectorizable<Device, const ArgType>::value;
-    internal::TensorExecutor<const EvalTo, typename internal::remove_const<Device>::type, Vectorize>::run(evalToTmp, m_device);
+    const bool Tile = TensorEvaluator<const ArgType, Device>::BlockAccess &&
+                      TensorEvaluator<const ArgType, Device>::PreferBlockAccess;
+
+    internal::TensorExecutor<const EvalTo,
+                             typename internal::remove_const<Device>::type,
+                             Vectorize, Tile>::run(evalToTmp, m_device);
     return true;
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
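
In the TensorForcedEval change, the Tile flag is a compile-time constant derived from the argument evaluator's traits, so the choice between the tiled and the default executor is made by template specialization, not by a runtime branch. The following is a minimal sketch of that dispatch pattern; Executor and ReverseEvaluatorTraits are hypothetical stand-ins for Eigen's internal::TensorExecutor and TensorEvaluator traits.

    // Hypothetical stand-ins illustrating compile-time executor selection.
    #include <iostream>

    // Primary template: fall back to coefficient-at-a-time evaluation.
    template <bool Vectorize, bool Tile>
    struct Executor {
      static void run() { std::cout << "default executor: coefficient loop\n"; }
    };

    // Partial specialization selected when Tile is true at compile time.
    template <bool Vectorize>
    struct Executor<Vectorize, /*Tile=*/true> {
      static void run() { std::cout << "tiled executor: block-by-block\n"; }
    };

    // Stand-in for TensorEvaluator traits; reverse now sets both flags.
    struct ReverseEvaluatorTraits {
      static const bool BlockAccess = true;
      static const bool PreferBlockAccess = true;
    };

    int main() {
      const bool Vectorize = true;
      const bool Tile = ReverseEvaluatorTraits::BlockAccess &&
                        ReverseEvaluatorTraits::PreferBlockAccess;
      Executor<Vectorize, Tile>::run();  // picks the tiled specialization
      return 0;
    }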
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 3a699095c..64fed6de7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -113,18 +113,25 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
   typedef typename Storage::Type EvaluatorPointerType;

   enum {
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    PreferBlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = true,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false,
   };

+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
+      OutputTensorBlock;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
                                                         const Device& device)
-      : m_impl(op.expression(), device), m_reverse(op.reverse())
+      : m_impl(op.expression(), device),
+        m_reverse(op.reverse()),
+        m_device(device)
   {
     // Reversing a scalar isn't supported yet. It would be a no-op anyway.
     EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -142,6 +149,10 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
         m_strides[i] = m_strides[i+1] * m_dimensions[i+1];
       }
     }
+    // Remember the strides for fast division.
+    for (int i = 0; i < NumDims; ++i) {
+      m_fastStrides[i] = internal::TensorIntDivisor<Index>(m_strides[i]);
+    }
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -162,7 +173,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
-        Index idx = index / m_strides[i];
+        Index idx = index / m_fastStrides[i];
         index -= idx * m_strides[i];
         if (m_reverse[i]) {
           idx = m_dimensions[i] - idx - 1;
@@ -177,7 +188,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
     } else {
       EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
-        Index idx = index / m_strides[i];
+        Index idx = index / m_fastStrides[i];
         index -= idx * m_strides[i];
         if (m_reverse[i]) {
           idx = m_dimensions[i] - idx - 1;
@@ -217,6 +228,131 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
     return rslt;
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    Eigen::Index block_total_size_max = numext::maxi<Eigen::Index>(
+        1, m_device.lastLevelCacheSize() / sizeof(Scalar));
+    resources->push_back(internal::TensorOpResourceRequirements(
+        internal::kSkewedInnerDims, block_total_size_max));
+  }
+
+  struct BlockIteratorState {
+    Index block_size;
+    Index block_stride;
+    Index block_span;
+    Index input_size;
+    Index input_stride;
+    Index input_span;
+    Index count;
+    bool reverse;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+      OutputTensorBlock* output_block) const {
+    if (NumDims <= 0) return;
+
+    // TODO(ezhulenev): If the underlying tensor expression supports and
+    // prefers block evaluation we must use it. Currently we use coeff and
+    // packet access into the underlying tensor expression.
+    // static const bool useBlockAccessForArgType =
+    //     TensorEvaluator<ArgType, Device>::BlockAccess &&
+    //     TensorEvaluator<ArgType, Device>::PreferBlockAccess;
+
+    static const bool isColMajor =
+        static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+    static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
+    const bool inner_dim_reversed = m_reverse[inner_dim_idx];
+
+    CoeffReturnType* data = output_block->data();
+    Index block_offset = 0;
+
+    Index input_offset = reverseIndex(output_block->first_coeff_index());
+
+    // Initialize output block iterator state. Dimensions in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims> it;
+    for (Index i = 0; i < NumDims; ++i) {
+      const Index dim = isColMajor ? i : NumDims - 1 - i;
+      it[i].block_size = output_block->block_sizes()[dim];
+      it[i].block_stride = output_block->block_strides()[dim];
+      it[i].block_span = it[i].block_stride * (it[i].block_size - 1);
+      it[i].input_size = m_dimensions[dim];
+      it[i].input_stride = m_strides[dim];
+      it[i].input_span = it[i].input_stride * (it[i].input_size - 1);
+      it[i].count = 0;
+      it[i].reverse = m_reverse[dim];
+
+      if (it[i].reverse) {
+        it[i].input_stride = -1 * it[i].input_stride;
+        it[i].input_span = -1 * it[i].input_span;
+      }
+    }
+
+    // If multiple inner dimensions have the same reverse flag, check if we
+    // can merge them into a single virtual inner dimension.
+    int effective_inner_dim = 0;
+    for (int i = 1; i < NumDims; ++i) {
+      if (it[i].reverse != it[effective_inner_dim].reverse) break;
+      if (it[i].block_stride != it[effective_inner_dim].input_size) break;
+      if (it[i].block_stride != numext::abs(it[i].input_stride)) break;
+
+      it[i].block_size = it[effective_inner_dim].block_size * it[i].block_size;
+      it[i].input_size = it[effective_inner_dim].input_size * it[i].input_size;
+
+      it[i].block_stride = 1;
+      it[i].input_stride = (inner_dim_reversed ? -1 : 1);
+
+      it[i].block_span = it[i].block_stride * (it[i].block_size - 1);
+      it[i].input_span = it[i].input_stride * (it[i].input_size - 1);
+
+      effective_inner_dim = i;
+    }
+
+    eigen_assert(it[effective_inner_dim].block_stride == 1);
+    eigen_assert(it[effective_inner_dim].input_stride ==
+                 (inner_dim_reversed ? -1 : 1));
+
+    const Index inner_dim_size = it[effective_inner_dim].block_size;
+
+    while (it[NumDims - 1].count < it[NumDims - 1].block_size) {
+      // Copy inner-most dimension data from its (possibly reversed) location
+      // in the input.
+      Index dst = block_offset;
+      Index src = input_offset;
+
+      // NOTE(ezhulenev): Adding a vectorized path with internal::preverse
+      // showed worse results in benchmarks than a simple coefficient loop.
+      if (inner_dim_reversed) {
+        for (Index i = 0; i < inner_dim_size; ++i) {
+          data[dst] = m_impl.coeff(src);
+          ++dst;
+          --src;
+        }
+      } else {
+        for (Index i = 0; i < inner_dim_size; ++i) {
+          data[dst] = m_impl.coeff(src);
+          ++dst;
+          ++src;
+        }
+      }
+
+      // For a 1d tensor we need to generate only one inner-most dimension.
+      if ((NumDims - effective_inner_dim) == 1) break;
+
+      // Update offsets.
+      for (Index i = effective_inner_dim + 1; i < NumDims; ++i) {
+        if (++it[i].count < it[i].block_size) {
+          block_offset += it[i].block_stride;
+          input_offset += it[i].input_stride;
+          break;
+        }
+        if (i != NumDims - 1) it[i].count = 0;
+        block_offset -= it[i].block_span;
+        input_offset -= it[i].input_span;
+      }
+    }
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                      2 * TensorOpCost::MulCost<Index>() +
@@ -242,8 +378,10 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
 protected:
   Dimensions m_dimensions;
   array<Index, NumDims> m_strides;
+  array<internal::TensorIntDivisor<Index>, NumDims> m_fastStrides;
   TensorEvaluator<ArgType, Device> m_impl;
   ReverseDimensions m_reverse;
+  const Device& m_device;
 };

 // Eval as lvalue
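
The reverse evaluator's index arithmetic peels one coordinate per dimension off the linear index with a division by the cached stride, flips the coordinate when that dimension is reversed, and re-accumulates an input index; m_fastStrides only replaces the division with a cheaper multiply-and-shift. Below is a self-contained sketch of the mapping using plain division; reverse_index and the other names are illustrative, not Eigen API.

    // Illustrative sketch of the index-reversal math; not Eigen's API.
    #include <array>
    #include <cassert>
    #include <cstddef>

    const int kDims = 3;

    std::ptrdiff_t reverse_index(std::ptrdiff_t index,
                                 const std::array<std::ptrdiff_t, kDims>& dims,
                                 const std::array<std::ptrdiff_t, kDims>& strides,
                                 const std::array<bool, kDims>& reverse) {
      // Column-major: peel coordinates from the outermost dimension down.
      std::ptrdiff_t input_index = 0;
      for (int i = kDims - 1; i >= 0; --i) {
        std::ptrdiff_t coord = index / strides[i];  // Eigen uses TensorIntDivisor here
        index -= coord * strides[i];
        if (reverse[i]) coord = dims[i] - coord - 1;
        input_index += coord * strides[i];
      }
      return input_index;
    }

    int main() {
      // 2x3x4 column-major tensor, reversing only the middle dimension.
      const std::array<std::ptrdiff_t, kDims> dims = {{2, 3, 4}};
      const std::array<std::ptrdiff_t, kDims> strides = {{1, 2, 6}};
      const std::array<bool, kDims> reverse = {{false, true, false}};
      // Coordinate (0, 0, 0) must read from (0, 2, 0) = linear index 4.
      assert(reverse_index(0, dims, strides, reverse) == 4);
      // Coordinate (1, 2, 3) = linear 23 must read from (1, 0, 3) = linear 19.
      assert(reverse_index(23, dims, strides, reverse) == 19);
      return 0;
    }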
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index ae04785ce..ad6332179 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -273,6 +273,11 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
             input_block_strides[i + 1] * input_block_sizes[i + 1];
       }
     }
+    DSizes<internal::TensorIntDivisor<Index>, NumDims> fast_input_block_strides;
+    for (int i = 0; i < NumDims; ++i) {
+      fast_input_block_strides[i] =
+          internal::TensorIntDivisor<Index>(input_block_strides[i]);
+    }

     // Read input block.
     TensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
@@ -296,8 +301,9 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
         continue;
       }

-      Index output_index = GetBlockOutputIndex(input_index, input_block_strides,
-                                               output_block_strides);
+      Index output_index =
+          GetBlockOutputIndex(input_index, input_block_strides,
+                              output_block_strides, fast_input_block_strides);
       if (output_index == input_index) {
         // Coefficient already in place.
         bitmap[output_index] = true;
@@ -315,8 +321,9 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
         data[output_index] = shuffled_value;
         shuffled_value = evicted_value;
         bitmap[output_index] = true;
-        output_index = GetBlockOutputIndex(output_index, input_block_strides,
-                                           output_block_strides);
+        output_index =
+            GetBlockOutputIndex(output_index, input_block_strides,
+                                output_block_strides, fast_input_block_strides);
       } while (output_index != input_index);

       data[output_index] = shuffled_value;
@@ -345,11 +352,12 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
       Index input_index, const DSizes<Index, NumDims>& input_block_strides,
-      const DSizes<Index, NumDims>& output_block_strides) const {
+      const DSizes<Index, NumDims>& output_block_strides,
+      const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const {
     Index output_index = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = NumDims - 1; i > 0; --i) {
-        const Index idx = input_index / input_block_strides[i];
+        const Index idx = input_index / fast_input_block_strides[i];
         output_index += idx * output_block_strides[m_inverseShuffle[i]];
         input_index -= idx * input_block_strides[i];
       }
@@ -357,7 +365,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
       return output_index + input_index *
           output_block_strides[m_inverseShuffle[0]];
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
-        const Index idx = input_index / input_block_strides[i];
+        const Index idx = input_index / fast_input_block_strides[i];
         output_index += idx * output_block_strides[m_inverseShuffle[i]];
         input_index -= idx * input_block_strides[i];
       }
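
The shuffled-block path above rearranges coefficients in place by following permutation cycles: each unvisited slot starts a cycle, the current value is pushed forward while the occupant is evicted and carried on, and a bitmap marks slots that are done, so every element moves exactly once. Below is a minimal sketch of that cycle-following idea with an explicit destination map; permute_in_place and dest_of are illustrative names standing in for the GetBlockOutputIndex computation.

    // Illustrative sketch of in-place cycle-following permutation.
    #include <cassert>
    #include <cstddef>
    #include <utility>
    #include <vector>

    // dest_of(i) plays the role of GetBlockOutputIndex: where element i goes.
    void permute_in_place(std::vector<int>& data,
                          const std::vector<std::size_t>& dest_of) {
      std::vector<bool> done(data.size(), false);  // the bitmap
      for (std::size_t i = 0; i < data.size(); ++i) {
        if (done[i]) continue;
        std::size_t j = dest_of[i];
        int carried = data[i];
        done[i] = true;
        while (j != i) {                // follow the cycle starting at i
          std::swap(carried, data[j]);  // evict the occupant, drop the carried value
          done[j] = true;
          j = dest_of[j];
        }
        data[i] = carried;              // the cycle closes back at i
      }
    }

    int main() {
      std::vector<int> data = {10, 11, 12, 13};
      // Send 0->2, 1->1, 2->3, 3->0 (one fixed point, one 3-cycle).
      permute_in_place(data, {2, 1, 3, 0});
      assert(data[2] == 10 && data[1] == 11 && data[3] == 12 && data[0] == 13);
      return 0;
    }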
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
index 8ea03382d..e9922a48d 100644
--- a/unsupported/test/cxx11_tensor_executor.cpp
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -527,6 +527,41 @@ static void test_execute_generator_op(Device d)
   }
 }

+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          bool Tileable, int Layout>
+static void test_execute_reverse_rvalue(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(1, numext::pow(1000000.0, 1.0 / NumDims));
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  // Reverse half of the dimensions.
+  Eigen::array<bool, NumDims> reverse;
+  for (int i = 0; i < NumDims; ++i) reverse[i] = (dims[i] % 2 == 0);
+
+  const auto expr = src.reverse(reverse);
+
+  // We assume that reversing on a default device is tested and correct, so
+  // we can rely on it to verify correctness of tensor executor and tiling.
+  Tensor<T, NumDims, Options, Index> golden;
+  golden = expr;
+
+  // Now do the reversing using configured tensor executor.
+  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
 #ifdef EIGEN_DONT_VECTORIZE
 #define VECTORIZABLE(VAL) !EIGEN_DONT_VECTORIZE && VAL
 #else
@@ -619,8 +654,14 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
   CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4);
   CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5);

+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5);
+
   // Force CMake to split this test.
-  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14
 }

 #undef CALL_SUBTEST_COMBINATIONS
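
A closing note on internal::TensorIntDivisor, which both the TensorReverse and TensorShuffling changes introduce: when the divisor (a stride) is fixed for the lifetime of the evaluator, integer division can be strength-reduced to one multiply and one shift with a precomputed magic number. Below is a simplified sketch of the classic round-up multiplier scheme for non-negative 31-bit operands; FastDivisor illustrates the idea and is not Eigen's actual implementation.

    // Simplified magic-number divisor; not Eigen's TensorIntDivisor.
    #include <cassert>
    #include <cstdint>

    // Precomputes ceil(2^(31+s) / d) with s = ceil(log2(d)); by the classic
    // division-by-invariant-integers bound, (n * magic) >> (31 + s) then
    // equals n / d for all 0 <= n < 2^31.
    struct FastDivisor {
      std::uint64_t magic;
      int shift;

      explicit FastDivisor(std::uint32_t d) {
        assert(d >= 1 && d < (1u << 31));
        shift = 0;
        while ((std::uint32_t(1) << shift) < d) ++shift;  // ceil(log2(d))
        magic = ((std::uint64_t(1) << (31 + shift)) + d - 1) / d;
      }

      std::uint32_t divide(std::uint32_t n) const {  // exact for n < 2^31
        return static_cast<std::uint32_t>((n * magic) >> (31 + shift));
      }
    };

    int main() {
      // Self-check against ordinary division for a few stride-like divisors.
      const std::uint32_t divisors[] = {1, 2, 3, 7, 24, 1000, 40960};
      for (std::uint32_t d : divisors) {
        FastDivisor fast(d);
        for (std::uint32_t n = 0; n < 100000; ++n) {
          assert(fast.divide(n) == n / d);
        }
      }
      return 0;
    }

Since every index-to-coordinate step in coeff() and GetBlockOutputIndex() divides by a stride that never changes, caching these divisors (m_fastStrides, fast_input_block_strides) pays off across the many coefficients of a block.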