From 83c0a16baf5ecac6288cd9b74536a82de8985b31 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 31 Jul 2018 15:56:31 -0700 Subject: Add block evaluation support to TensorOps --- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 437 ++++++++++++++++++++- 1 file changed, 431 insertions(+), 6 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 375fc0802..05c0990dc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -356,6 +356,70 @@ template __global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif +template +class BlockReducer { + public: + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::PacketReturnType PacketReturnType; + explicit BlockReducer(const Op& reducer) : op_(reducer) { + accum_ = op_.initialize(); + } + void Reduce(Index index, Index num_values_to_reduce, Scalar* data) { + for (Index i = 0; i < num_values_to_reduce; ++i) { + op_.reduce(data[index + i], &accum_); + } + } + CoeffReturnType Finalize() { return op_.finalize(accum_); } + PacketReturnType FinalizePacket() { + // TODO(andydavis) This function should not be called for Scalar + // reductions: clean this up or add an assert here. + return PacketReturnType(); + } + + private: + CoeffReturnType accum_; + Op op_; +}; + +template +class BlockReducer { + public: + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::PacketReturnType PacketReturnType; + static const Index PacketSize = + internal::unpacket_traits::size; + + explicit BlockReducer(const Op& reducer) : op_(reducer) { + vaccum_ = op_.template initializePacket(); + accum_ = op_.initialize(); + } + void Reduce(Index index, Index num_values_to_reduce, Scalar* data) { + const Index vectorized_size = + (num_values_to_reduce / PacketSize) * PacketSize; + for (Index i = 0; i < vectorized_size; i += PacketSize) { + op_.reducePacket( + internal::ploadt(&data[index + i]), + &vaccum_); + } + for (Index i = vectorized_size; i < num_values_to_reduce; ++i) { + op_.reduce(data[index + i], &accum_); + } + } + CoeffReturnType Finalize() { return op_.finalizeBoth(accum_, vaccum_); } + PacketReturnType FinalizePacket() { return op_.finalizePacket(vaccum_); } + + private: + PacketReturnType vaccum_; + CoeffReturnType accum_; + Op op_; +}; + } // end namespace internal @@ -394,6 +458,7 @@ class TensorReductionOp : public TensorBase class MakePointer_, typename Device> struct TensorEvaluator, Device> { + typedef internal::reducer_traits ReducerTraits; typedef TensorReductionOp XprType; typedef typename XprType::Index Index; typedef ArgType ChildType; @@ -410,14 +475,19 @@ struct TensorEvaluator, static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = Self::InputPacketAccess && Op::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using 
OutputTensorBlock = internal::TensorBlock; + using InputTensorBlock = internal::TensorBlock; + static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; static const bool RunningFullReduction = (NumOutputDims==0); @@ -451,11 +521,13 @@ struct TensorEvaluator, m_outputStrides[0] = 1; for (int i = 1; i < NumOutputDims; ++i) { m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } else { - m_outputStrides.back() = 1; + m_outputStrides[NumOutputDims - 1] = 1; for (int i = NumOutputDims - 2; i >= 0; --i) { m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } } @@ -483,6 +555,7 @@ struct TensorEvaluator, ++reduceIndex; } else { m_preservedStrides[outputIndex] = input_strides[i]; + m_output_to_input_dim_map[outputIndex] = i; ++outputIndex; } } @@ -492,6 +565,16 @@ struct TensorEvaluator, if (NumOutputDims == 0) { m_preservedStrides[0] = internal::array_prod(input_dims); } + + m_numValuesToReduce = + NumOutputDims == 0 + ? internal::array_prod(input_dims) + : (static_cast(Layout) == static_cast(ColMajor)) + ? m_preservedStrides[0] + : m_preservedStrides[NumOutputDims - 1]; + + m_block_total_size_max = + numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -686,6 +769,265 @@ struct TensorEvaluator, } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kSkewedInnerDims, + m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void block( + OutputTensorBlock* output_block) const { + // Special case full reductions to avoid input block copy below. + if (NumInputDims == NumReducedDims) { + eigen_assert(output_block->first_coeff_index() == 0); + eigen_assert(output_block->block_sizes().TotalSize() == 1); + Op reducer(m_reducer); + output_block->data()[0] = internal::InnerMostDimReducer::reduce( + *this, 0, m_numValuesToReduce, reducer); + return; + } + + // Calculate input tensor 'slice' required to reduce output block coeffs. + DSizes input_slice_sizes(m_impl.dimensions()); + for (int i = 0; i < NumOutputDims; ++i) { + // Clip preserved input dimensions by output block size. + input_slice_sizes[m_output_to_input_dim_map[i]] = + output_block->block_sizes()[i]; + } + + // Shard input tensor slice into blocks (because it could be large if we + // need to reduce along several dimensions to calculate required output + // coefficients). + const Index max_coeff_count = + numext::mini(((m_device.firstLevelCacheSize()) / sizeof(Scalar)), + input_slice_sizes.TotalSize()); + + // Calculate max output shard size needed to keep working set of reducers + // in L1, while leaving enough space for reducer overhead and 'PacketSize' + // reductions. + DSizes target_input_block_sizes; + CalculateTargetInputBlockShape(max_coeff_count, input_slice_sizes, + &target_input_block_sizes); + // Calculate indices for first preserved dimension. + const Index first_preserved_dim_output_index = + static_cast(Layout) == static_cast(ColMajor) + ? 
0 + : NumOutputDims - 1; + const Index first_preserved_dim_input_index = + m_output_to_input_dim_map[first_preserved_dim_output_index]; + const bool inner_most_dim_preserved = + first_preserved_dim_input_index == + (static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumInputDims - 1) | + PreservingInnerMostDims; + + // Calculate output block inner/outer dimension sizes. + const Index output_block_inner_dim_size = + output_block->block_sizes()[first_preserved_dim_output_index]; + const Index output_block_outer_dim_size = + output_block->block_sizes().TotalSize() / output_block_inner_dim_size; + // Calculate shard size for first preserved dimension. + const Index output_shard_size = + target_input_block_sizes[first_preserved_dim_input_index]; + const Index num_output_shards = + (output_block_inner_dim_size + output_shard_size - 1) / + output_shard_size; + + // Initialize 'tensor_slice_offsets' from input coords of output index. + DSizes tensor_slice_offsets; + GetInputCoordsForOutputIndex(output_block->first_coeff_index(), + &tensor_slice_offsets); + + // Store tensor slice offset in first preserved dimension to be used + // to update tensor slice extents in loop below. + const Index first_preserved_dim_offset_start = + tensor_slice_offsets[first_preserved_dim_input_index]; + + array block_iter_state; + + // Initialize state used to iterate through output coefficients + // and update 'tensor_slice_offsets' in outer preserved dims. + for (int i = 0; i < NumOutputDims - 1; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? i + 1 + : NumOutputDims - i - 2; + block_iter_state[i].input_dim = m_output_to_input_dim_map[dim]; + block_iter_state[i].output_size = output_block->block_sizes()[dim]; + block_iter_state[i].output_count = 0; + } + + // Allocate input block memory. + ScalarNoConst* input_block_data = static_cast( + m_device.allocate(max_coeff_count * sizeof(Scalar))); + // Allocate reducer memory. + const bool packet_reductions_enabled = + (Self::InputPacketAccess & Self::ReducerTraits::PacketAccess); + const Index num_reducers = + (inner_most_dim_preserved && packet_reductions_enabled) + ? (output_shard_size / PacketSize + output_shard_size % PacketSize + + PacketSize) + : output_shard_size; + typedef internal::BlockReducer BlockReducer; + BlockReducer* reducers = static_cast( + m_device.allocate(num_reducers * sizeof(BlockReducer))); + + InputDimensions input_tensor_dims(m_impl.dimensions()); + for (Index output_outer_index = 0; + output_outer_index < output_block_outer_dim_size; + ++output_outer_index) { + for (Index output_shard_index = 0; output_shard_index < num_output_shards; + ++output_shard_index) { + // Initialize 'tensor_slice_extents' for this output shard. + DSizes tensor_slice_extents(input_slice_sizes); + for (int i = 0; i < NumInputDims; ++i) { + if (i == first_preserved_dim_input_index) { + // Clip first preserved dim size to output shard size. + tensor_slice_extents[i] = numext::mini( + output_shard_size, + input_slice_sizes[i] - (tensor_slice_offsets[i] - + first_preserved_dim_offset_start)); + + } else if (!m_reduced[i]) { + // Clip outer preserved dims to size 1, so that we reduce a + // contiguous set of output coefficients. + tensor_slice_extents[i] = 1; + } + } + + // Intialize output coefficient reducers. 
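        // The reducer buffer above is a raw device allocation, so each
        // reducer is constructed in place with placement new below, and the
        // buffer is simply handed back via m_device.deallocate() at the end
        // of this method. Illustrative numbers only: with
        // output_shard_size = 35 and PacketSize = 4, the packet-enabled
        // branch sizes the buffer as 35 / 4 + 35 % 4 + 4 = 8 + 3 + 4 = 15
        // reducers -- 8 packet reducers for the vectorizable prefix, 3
        // scalar reducers for the tail, plus PacketSize extra entries.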
+ for (int i = 0; i < num_reducers; ++i) { + new (&reducers[i]) BlockReducer(m_reducer); + } + + using TensorSliceBlockMapper = + internal::TensorSliceBlockMapper; + + // TODO(andydavis) Consider removing 'input_block_stride_order' if we + // find that scattered reads are not worth supporting in + // TensorSliceBlockMapper. + TensorSliceBlockMapper block_mapper( + input_tensor_dims, tensor_slice_offsets, tensor_slice_extents, + target_input_block_sizes, DimensionList()); + + const Index num_outputs_to_update = + tensor_slice_extents[first_preserved_dim_input_index]; + const Index preserved_dim_vector_reducer_count = + (inner_most_dim_preserved && packet_reductions_enabled) + ? num_outputs_to_update / PacketSize + : 0; + const Index preserved_dim_vector_coeff_count = + inner_most_dim_preserved + ? preserved_dim_vector_reducer_count * PacketSize + : 0; + const Index preserved_dim_reducer_limit = + (inner_most_dim_preserved && packet_reductions_enabled) + ? (preserved_dim_vector_reducer_count + + num_outputs_to_update % PacketSize) + : num_outputs_to_update; + + const Index total_block_count = block_mapper.total_block_count(); + for (Index b = 0; b < total_block_count; ++b) { + InputTensorBlock input_block = + block_mapper.GetBlockForIndex(b, input_block_data); + // Read. + m_impl.block(&input_block); + + Index num_values_to_reduce = 1; + for (Index i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + num_values_to_reduce *= input_block.block_sizes()[i]; + } + } + // Reduce. + if (inner_most_dim_preserved) { + const Index input_outer_dim_size = + input_block.block_sizes().TotalSize() / num_outputs_to_update; + for (Index input_outer_dim_index = 0; + input_outer_dim_index < input_outer_dim_size; + ++input_outer_dim_index) { + const Index input_outer_dim_base = + input_outer_dim_index * num_outputs_to_update; + for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) { + reducers[i].Reduce(input_outer_dim_base + i * PacketSize, + PacketSize, input_block.data()); + } + const Index scalar_reducer_base = + input_outer_dim_base + preserved_dim_vector_coeff_count; + for (Index i = preserved_dim_vector_reducer_count; + i < preserved_dim_reducer_limit; ++i) { + reducers[i].Reduce(scalar_reducer_base + i - + preserved_dim_vector_reducer_count, + 1, input_block.data()); + } + } + } else { + for (Index i = 0; i < num_outputs_to_update; ++i) { + reducers[i].Reduce(i * num_values_to_reduce, num_values_to_reduce, + input_block.data()); + } + } + } + + // Finalize all reducers for this output shard. 
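        // Each packet reducer accumulated PacketSize consecutive output
        // coefficients, so its finalized packet is stored to a small aligned
        // scratch array and copied out; each remaining scalar reducer owns
        // exactly one output coefficient. output_base_index locates this
        // shard inside the output block: the outer preserved dims select a
        // run of output_block_inner_dim_size coefficients, and the shard
        // index an offset of output_shard_size within that run.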
+ const Index output_base_index = + output_outer_index * output_block_inner_dim_size + + output_shard_index * output_shard_size; + if (inner_most_dim_preserved) { + EIGEN_ALIGN_MAX + typename internal::remove_const::type + values[PacketSize]; + for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) { + const Index reducer_base = output_base_index + i * PacketSize; + internal::pstore( + values, reducers[i].FinalizePacket()); + for (Index j = 0; j < PacketSize; ++j) { + output_block->data()[reducer_base + j] = values[j]; + } + } + const Index scalar_reducer_base = + output_base_index + preserved_dim_vector_coeff_count; + + for (Index i = preserved_dim_vector_reducer_count; + i < preserved_dim_reducer_limit; ++i) { + output_block->data()[scalar_reducer_base + i - + preserved_dim_vector_reducer_count] = + reducers[i].Finalize(); + } + } else { + for (int i = 0; i < num_outputs_to_update; ++i) { + output_block->data()[output_base_index + i] = + reducers[i].Finalize(); + } + } + + // Update 'tensor_slice_offsets' by num outputs for this output shard. + tensor_slice_offsets[first_preserved_dim_input_index] += + num_outputs_to_update; + } + // Update slice offset for inner preserved dim. + tensor_slice_offsets[first_preserved_dim_input_index] -= + output_block_inner_dim_size; + // Update slice offsets for remaining output dims. + for (int i = 0; i < NumOutputDims - 1; ++i) { + BlockIteratorState& b = block_iter_state[i]; + if (++b.output_count < b.output_size) { + ++tensor_slice_offsets[b.input_dim]; + break; + } + b.output_count = 0; + tensor_slice_offsets[b.input_dim] -= b.output_size - 1; + } + } + + // Free memory. + m_device.deallocate(input_block_data); + m_device.deallocate(reducers); + } + EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return m_result; } #if defined(EIGEN_USE_SYCL) @@ -722,6 +1064,12 @@ struct TensorEvaluator, template friend struct internal::InnerReducer; + struct BlockIteratorState { + Index input_dim; + Index output_size; + Index output_count; + }; + // Returns the Index in the input tensor of the first value that needs to be // used to compute the reduction at output index "index". EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { @@ -764,16 +1112,90 @@ struct TensorEvaluator, return startInput; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void GetInputCoordsForOutputIndex( + Index index, + DSizes* coords) const { + for (int i = 0; i < NumInputDims; ++i) { + (*coords)[i] = 0; + } + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + (*coords)[m_output_to_input_dim_map[i]] = idx; + index -= idx * m_outputStrides[i]; + } + (*coords)[m_output_to_input_dim_map[0]] = index; + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + (*coords)[m_output_to_input_dim_map[i]] = idx; + index -= idx * m_outputStrides[i]; + } + (*coords)[m_output_to_input_dim_map[NumOutputDims-1]] = index; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void CalculateTargetInputBlockShape( + const Index max_coeff_count, + const DSizes& input_slice_sizes, + DSizes* target_input_block_sizes) const { + typedef typename internal::packet_traits::type Packet; + typedef internal::BlockReducer BlockReducer; + // TODO(andydavis) Compute reducer overhead correctly for the case where + // we are preserving the inner most dimension, and a single reducer + // reduces a packet's worth of output coefficients. 
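    // reducer_overhead below expresses the per-output reducer state in units
    // of Scalar so it can be charged against max_coeff_count, which is also a
    // count of scalars. Illustrative numbers only: with
    // max_coeff_count = 8192 and reducer_overhead = 8, the first preserved
    // dimension is capped at roughly 8192 / 8 = 1024 outputs before reducer
    // state alone would exhaust the L1 budget.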
+ const Index reducer_overhead = sizeof(BlockReducer) / sizeof(Scalar); + + Index coeff_to_allocate = max_coeff_count; + bool first_preserved_dim_allocated = false; + bool first_reduced_dim_allocated = false; + for (int i = 0; i < NumInputDims; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? i + : NumInputDims - i - 1; + (*target_input_block_sizes)[dim] = 1; + if (m_reduced[dim]) { + // TODO(andydavis) Consider allocating to multiple reduced dimensions. + // Watch out for cases where reduced dimensions are not contiguous, + // which induces scattered reads. + if (!first_reduced_dim_allocated) { + (*target_input_block_sizes)[dim] = + numext::mini(input_slice_sizes[dim], coeff_to_allocate); + coeff_to_allocate /= (*target_input_block_sizes)[dim]; + first_reduced_dim_allocated = true; + } + } else if (!first_preserved_dim_allocated) { + // TODO(andydavis) Include output block size in this L1 working set + // calculation. + const Index allocated = max_coeff_count - coeff_to_allocate; + const Index alloc_size = numext::maxi( + static_cast(1), coeff_to_allocate / reducer_overhead); + (*target_input_block_sizes)[dim] = + numext::mini(input_slice_sizes[dim], alloc_size); + coeff_to_allocate = numext::maxi( + static_cast(1), + coeff_to_allocate / + ((*target_input_block_sizes)[dim] * reducer_overhead)); + first_preserved_dim_allocated = true; + } + } + } + // Bitmap indicating if an input dimension is reduced or not. array m_reduced; // Dimensions of the output of the operation. Dimensions m_dimensions; // Precomputed strides for the output tensor. array m_outputStrides; + array, NumOutputDims> m_fastOutputStrides; // Subset of strides of the input tensor for the non-reduced dimensions. // Indexed by output dimensions. static const int NumPreservedStrides = max_n_1::size; array m_preservedStrides; + // Map from output to input dimension index. + array m_output_to_input_dim_map; + // How many values go into each reduction + Index m_numValuesToReduce; // Subset of strides of the input tensor for the reduced dimensions. // Indexed by reduced dimensions. @@ -782,6 +1204,9 @@ struct TensorEvaluator, // Indexed by reduced dimensions. array m_reducedDims; + // Block size for tiled (aka TensorBlock) evaluation. + Index m_block_total_size_max; + // Evaluator for the input expression. TensorEvaluator m_impl; -- cgit v1.2.3 From 64abdf1d7eb17174f571751346dd0cbadcf3bc52 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 1 Aug 2018 12:35:19 -0700 Subject: Fix typo + get rid of redundant member variables for block sizes --- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 6 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 17 ++++++++--------- unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 11 +++++------ unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 12 +++--------- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 10 +++------- unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 18 ++++++++++-------- 6 files changed, 32 insertions(+), 42 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index b6dbe5a22..cca14aafd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -120,7 +120,7 @@ struct TensorEvaluator, Device> // Block based access to the XprType (input) tensor. 
using TensorBlock = internal::TensorBlock; using TensorBlockReader = internal::TensorBlockReader; - // We do block based broadcasting using a a trick with 2x tensor rank and 0 + // We do block based broadcasting using a trick with 2x tensor rank and 0 // strides. See block method implementation for details. using BroadcastDimensions = DSizes; using BroadcastTensorBlock = internal::TensorBlock; @@ -589,8 +589,8 @@ struct TensorEvaluator, Device> std::vector* resources) const { // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large // tensors. But this might need further tuning. - Index l1_cache_scalars = m_device.firstLevelCacheSize() / sizeof(Scalar); - Index block_total_size_max = numext::maxi(Index(1), l1_cache_scalars); + auto block_total_size_max = numext::maxi( + 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 7579ab507..aca2ead12 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -202,9 +202,6 @@ struct TensorEvaluator, Device> m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; } } - - m_block_total_size_max = - numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } } @@ -290,9 +287,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, - m_block_total_size_max)); + block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -370,13 +369,14 @@ struct TensorEvaluator, Device> { Index inputIndex; if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims - 1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); inputIndex = index * m_inputStride + m_inputOffset; - } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims-1) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { - // m_stride is aways greater than index, so let's avoid the integer division. + } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer + // division. 
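      // This branch covers chipping along the outer-most dimension (the last
      // dimension for ColMajor, the first for RowMajor): the chipped slice is
      // then a single contiguous run of the input, so the offset is added
      // directly instead of dividing by the stride.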
eigen_assert(m_stride > index); inputIndex = index + m_inputOffset; } else { @@ -392,7 +392,6 @@ struct TensorEvaluator, Device> Index m_stride; Index m_inputOffset; Index m_inputStride; - Index m_block_total_size_max; DSizes m_inputStrides; TensorEvaluator m_impl; const internal::DimensionId m_dim; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 39759b6c3..a8247be90 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -259,7 +259,7 @@ struct TensorEvaluator, Device> #else EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) #endif - : m_impl(op.expression(), device) + : m_device(device), m_impl(op.expression(), device) #ifdef EIGEN_USE_SYCL , m_op(op) #endif @@ -404,9 +404,6 @@ struct TensorEvaluator, Device> } else { m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims-1]); } - - m_block_total_size_max = - numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -551,9 +548,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, - m_block_total_size_max)); + block_total_size_max)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( @@ -743,8 +742,8 @@ struct TensorEvaluator, Device> internal::TensorIntDivisor m_fastOutputDepth; Scalar m_paddingValue; - Index m_block_total_size_max; + const Device& m_device; TensorEvaluator m_impl; #ifdef EIGEN_USE_SYCL // Required for SYCL in order to construct the expression tree on the device diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 2630311b8..6ddded0bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -560,9 +560,6 @@ struct TensorEvaluator, Devi m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } - - m_block_total_size_max = - numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -672,9 +669,11 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, - m_block_total_size_max)); + block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -761,7 +760,6 @@ struct TensorEvaluator, Devi Dimensions m_dimensions; bool m_is_identity; const StartIndices m_offsets; - Index m_block_total_size_max; }; @@ -1047,9 +1045,6 @@ struct TensorEvaluator(degenerate ? 
1 : m_outputStrides[i]); } } - m_block_total_size_max = numext::maxi(static_cast(1), - device.lastLevelCacheSize() / - sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -1128,7 +1123,6 @@ struct TensorEvaluator m_dimensions; DSizes m_offsets; // offset in a flattened shape const Strides m_strides; - std::size_t m_block_total_size_max; //use by sycl const StartIndices m_exprStartIndices; //use by sycl diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index c41783106..73675e7dd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -572,9 +572,6 @@ struct TensorEvaluator, : (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; - - m_block_total_size_max = - numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -771,9 +768,11 @@ struct TensorEvaluator, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::TensorBlockShapeType::kSkewedInnerDims, - m_block_total_size_max)); + block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -1204,9 +1203,6 @@ struct TensorEvaluator, // Indexed by reduced dimensions. array m_reducedDims; - // Block size for tiled (aka TensorBlock) evaluation. - Index m_block_total_size_max; - // Evaluator for the input expression. 
TensorEvaluator m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 77f47bf64..f94c1380d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -124,8 +124,11 @@ struct TensorEvaluator, Device> using TensorBlock = internal::TensorBlock; using TensorBlockReader = internal::TensorBlockReader; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_shuffle(op.shufflePermutation()) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : m_device(device), + m_impl(op.expression(), device), + m_shuffle(op.shufflePermutation()) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); const Shuffle& shuffle = op.shufflePermutation(); @@ -162,9 +165,6 @@ struct TensorEvaluator, Device> for (int i = 0; i < NumDims; ++i) { m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]]; } - - m_block_total_size_max = - numext::maxi(1, device.firstLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -226,9 +226,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { + auto block_total_size_max = numext::maxi( + 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kUniformAllDims, - m_block_total_size_max)); + internal::TensorBlockShapeType::kUniformAllDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -384,7 +385,8 @@ struct TensorEvaluator, Device> array, NumDims> m_fastOutputStrides; array m_inputStrides; array m_unshuffledInputStrides; - Index m_block_total_size_max; + + const Device& m_device; TensorEvaluator m_impl; /// required by sycl Shuffle m_shuffle; -- cgit v1.2.3 From 1b0373ae10687ecc51ad9a0bfd46aa4ee116ade1 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 1 Aug 2018 15:55:46 -0700 Subject: Replace all using declarations with typedefs in Tensor ops --- unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 20 +++++------ .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 17 +++++---- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 16 +++++---- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 40 ++++++++++++---------- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 7 ++-- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 22 +++++++----- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 14 ++++---- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 16 +++++---- 8 files changed, 88 insertions(+), 64 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 33c4ef5b7..1db8d6124 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -152,11 +152,11 @@ struct TensorBlockCopyOp { const Scalar* src_base = &src_data[src_index]; Scalar* dst_base = &dst_data[dst_index]; - using Src = const Eigen::Array; - using Dst = Eigen::Array; + typedef const Eigen::Array Src; + typedef Eigen::Array Dst; - using SrcMap = Eigen::Map>; - using DstMap = Eigen::Map>; + typedef Eigen::Map> SrcMap; + typedef Eigen::Map> DstMap; const SrcMap 
src(src_base, num_coeff_to_copy, InnerStride<>(src_stride)); DstMap dst(dst_base, num_coeff_to_copy, InnerStride<>(dst_stride)); @@ -401,13 +401,13 @@ struct TensorBlockCwiseBinaryOp { const StorageIndex left_stride, const LeftScalar* left_data, const StorageIndex right_index, const StorageIndex right_stride, const RightScalar* right_data) { - using Lhs = const Eigen::Array; - using Rhs = const Eigen::Array; - using Out = Eigen::Array; + typedef const Eigen::Array Lhs; + typedef const Eigen::Array Rhs; + typedef Eigen::Array Out; - using LhsMap = Eigen::Map>; - using RhsMap = Eigen::Map>; - using OutMap = Eigen::Map>; + typedef Eigen::Map> LhsMap; + typedef Eigen::Map> RhsMap; + typedef Eigen::Map> OutMap; const LeftScalar* lhs_base = &left_data[left_index]; const RightScalar* rhs_base = &right_data[right_index]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index cca14aafd..a4d750885 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -115,16 +115,21 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; // Block based access to the XprType (input) tensor. - using TensorBlock = internal::TensorBlock; - using TensorBlockReader = internal::TensorBlockReader; + typedef internal::TensorBlock + TensorBlock; + typedef internal::TensorBlockReader + TensorBlockReader; + // We do block based broadcasting using a trick with 2x tensor rank and 0 // strides. See block method implementation for details. - using BroadcastDimensions = DSizes; - using BroadcastTensorBlock = internal::TensorBlock; - using BroadcastTensorBlockReader = internal::TensorBlockReader; + typedef DSizes BroadcastDimensions; + typedef internal::TensorBlock + BroadcastTensorBlock; + typedef internal::TensorBlockReader + BroadcastTensorBlockReader; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index aca2ead12..b4c4162ef 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -152,10 +152,12 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using InputTensorBlock = internal::TensorBlock; - using OutputTensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + InputTensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset()) @@ -426,10 +428,12 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using InputTensorBlock = internal::TensorBlock; - using OutputTensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + InputTensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) diff --git 
a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 17008917a..f11241f83 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -36,7 +36,7 @@ template class TensorExecutor { public: - using StorageIndex = typename Expression::Index; + typedef typename Expression::Index StorageIndex; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr, @@ -60,7 +60,7 @@ template class TensorExecutor { public: - using StorageIndex = typename Expression::Index; + typedef typename Expression::Index StorageIndex; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr, @@ -102,21 +102,22 @@ template class TensorExecutor { public: - using Scalar = typename traits::Scalar; - using ScalarNoConst = typename remove_const::type; + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; - using Evaluator = TensorEvaluator; - using StorageIndex = typename traits::Index; + typedef TensorEvaluator Evaluator; + typedef typename traits::Index StorageIndex; static const int NumDims = traits::NumDimensions; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) { - using TensorBlock = - TensorBlock; - using TensorBlockMapper = TensorBlockMapper; + typedef TensorBlock + TensorBlock; + typedef TensorBlockMapper + TensorBlockMapper; Evaluator evaluator(expr, device); Index total_size = array_prod(evaluator.dimensions()); @@ -221,7 +222,7 @@ struct EvalRange { template class TensorExecutor { public: - using StorageIndex = typename Expression::Index; + typedef typename Expression::Index StorageIndex; static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { @@ -249,20 +250,21 @@ class TensorExecutor { template class TensorExecutor { public: - using Scalar = typename traits::Scalar; - using ScalarNoConst = typename remove_const::type; + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; - using Evaluator = TensorEvaluator; - using StorageIndex = typename traits::Index; + typedef TensorEvaluator Evaluator; + typedef typename traits::Index StorageIndex; static const int NumDims = traits::NumDimensions; static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { - using TensorBlock = - TensorBlock; - using TensorBlockMapper = - TensorBlockMapper; + typedef TensorBlock + TensorBlock; + typedef TensorBlockMapper + TensorBlockMapper; Evaluator evaluator(expr, device); StorageIndex total_size = array_prod(evaluator.dimensions()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index a8247be90..2b2f4a650 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -252,7 +252,8 @@ struct TensorEvaluator, Device> RawAccess = false }; - using OutputTensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; #ifdef __SYCL_DEVICE_ONLY__ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device) @@ -557,8 +558,8 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( OutputTensorBlock* output_block) const { - using ImagePatchCopyOp = internal::ImagePatchCopyOp; - using ImagePatchPaddingOp = 
internal::ImagePatchPaddingOp; + typedef internal::ImagePatchCopyOp ImagePatchCopyOp; + typedef internal::ImagePatchPaddingOp ImagePatchPaddingOp; // Calculate loop limits and various input/output dim sizes. const DSizes& block_sizes = output_block->block_sizes(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 6ddded0bd..d5b0c1237 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -123,11 +123,15 @@ struct TensorEvaluator, Device> RawAccess = TensorEvaluator::RawAccess }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using InputTensorBlock = internal::TensorBlock; - using OutputTensorBlock = internal::TensorBlock; - using OutputTensorBlockReader = internal::TensorBlockReader; + typedef internal::TensorBlock + InputTensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; + typedef internal::TensorBlockReader + OutputTensorBlockReader; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dimensions(op.dimensions()) @@ -512,9 +516,10 @@ struct TensorEvaluator, Devi RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using TensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + TensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) @@ -787,9 +792,10 @@ struct TensorEvaluator, Device> RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using TensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + TensorBlock; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 73675e7dd..80f179ba4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -483,10 +483,12 @@ struct TensorEvaluator, RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using OutputTensorBlock = internal::TensorBlock; - using InputTensorBlock = internal::TensorBlock; + typedef internal::TensorBlock + OutputTensorBlock; + typedef internal::TensorBlock + InputTensorBlock; static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; @@ -901,9 +903,9 @@ struct TensorEvaluator, new (&reducers[i]) BlockReducer(m_reducer); } - using TensorSliceBlockMapper = - internal::TensorSliceBlockMapper; + typedef internal::TensorSliceBlockMapper + TensorSliceBlockMapper; // TODO(andydavis) Consider removing 'input_block_stride_order' if we // find that scattered reads are not worth supporting in diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index f94c1380d..fbe69aabc 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -119,10 +119,12 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using TensorBlock = internal::TensorBlock; - using TensorBlockReader = internal::TensorBlockReader; + typedef internal::TensorBlock + TensorBlock; + typedef internal::TensorBlockReader + TensorBlockReader; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -417,10 +419,12 @@ struct TensorEvaluator, Device> RawAccess = false }; - using ScalarNoConst = typename internal::remove_const::type; + typedef typename internal::remove_const::type ScalarNoConst; - using TensorBlock = internal::TensorBlock; - using TensorBlockWriter = internal::TensorBlockWriter; + typedef internal::TensorBlock + TensorBlock; + typedef internal::TensorBlockWriter + TensorBlockWriter; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) -- cgit v1.2.3 From f2209d06e428e0691de71f30fc2db4cb29191cd2 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 10 Aug 2018 16:53:36 -0700 Subject: Add block evaluationto CwiseUnaryOp and add PreferBlockAccess enum to all evaluators --- unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 2 + .../Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 18 +-- unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 141 +++++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 11 +- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 13 +- .../Eigen/CXX11/src/Tensor/TensorConcatenation.h | 2 + .../Eigen/CXX11/src/Tensor/TensorContraction.h | 1 + .../Eigen/CXX11/src/Tensor/TensorConversion.h | 1 + .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 2 + .../Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 1 + .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 2 + unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 1 + .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 76 ++++++++--- unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 1 + .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 1 + .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 1 + .../CXX11/src/Tensor/TensorForwardDeclarations.h | 7 +- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 1 + .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 13 +- .../Eigen/CXX11/src/Tensor/TensorInflation.h | 1 + .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 2 + .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 46 ++++--- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 1 + .../Eigen/CXX11/src/Tensor/TensorReduction.h | 13 +- unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 3 + unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 2 + unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 1 + .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 24 ++-- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 2 + unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h | 1 + .../Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 1 + unsupported/test/cxx11_tensor_block_access.cpp | 111 ++++++++++++++++ unsupported/test/cxx11_tensor_executor.cpp | 31 +++++ 35 files changed, 454 insertions(+), 82 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h 
index c0f33ba2d..ea3ab329d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -87,6 +87,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -220,6 +221,7 @@ struct TensorEvaluator, Devi IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator >, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h index 442639868..5110e99ee 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h @@ -109,6 +109,7 @@ struct TensorEvaluator, Sy IsAligned = false, PacketAccess = false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index f1f877c16..9ec1ec726 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -102,14 +102,16 @@ struct TensorEvaluator, Device> static const int NumDims = XprType::NumDims; enum { - IsAligned = TensorEvaluator::IsAligned & - TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess & - TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - RawAccess = TensorEvaluator::RawAccess + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & + TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess & + TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess | + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = TensorEvaluator::RawAccess }; typedef typename internal::TensorBlock< diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 877603421..4a3e1ac17 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -381,6 +381,147 @@ class TensorBlockWriter : public TensorBlockIO + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const UnaryFunctor& functor, const StorageIndex num_coeff, + const StorageIndex output_index, const StorageIndex output_stride, + OutputScalar* output_data, const StorageIndex input_index, + const StorageIndex input_stride, const InputScalar* input_data) { + typedef const Eigen::Array Input; + typedef Eigen::Array Output; + + typedef Eigen::Map> InputMap; + typedef Eigen::Map> OutputMap; + + const InputScalar* input_base = &input_data[input_index]; + OutputScalar* output_base = &output_data[output_index]; + + const InputMap input(input_base, num_coeff, InnerStride<>(input_stride)); + OutputMap output(output_base, num_coeff, InnerStride<>(output_stride)); + + output = Eigen::CwiseUnaryOp(input, functor); + } +}; + +/** + * \class TensorBlockCwiseUnaryIO + * \ingroup CXX11_Tensor_Module 
+ * + * \brief Tensor block IO class for carrying out cwise unary ops. + * + * This class carries out the unary op on given blocks. + */ +template +struct TensorBlockCwiseUnaryIO { + typedef typename internal::TensorBlock::Dimensions Dimensions; + + struct BlockIteratorState { + StorageIndex output_stride, output_span; + StorageIndex input_stride, input_span; + StorageIndex size, count; + }; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const UnaryFunctor& functor, const Dimensions& block_sizes, + const Dimensions& block_strides, OutputScalar* output_data, + const array& input_strides, + const InputScalar* input_data) { + // Find the innermost dimension whose size is not 1. This is the effective + // inner dim. If all dimensions are of size 1, fallback to using the actual + // innermost dim to avoid out-of-bound access. + int num_size_one_inner_dims = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = cond()(i, NumDims - i - 1); + if (block_sizes[dim] != 1) { + num_size_one_inner_dims = i; + break; + } + } + // Calculate strides and dimensions. + const int inner_dim = + NumDims == 0 ? 1 + : cond()(num_size_one_inner_dims, + NumDims - num_size_one_inner_dims - 1); + StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim]; + for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) { + const int dim = cond()(i, NumDims - i - 1); + // Merge multiple inner dims into one for larger inner dim size (i.e. + // fewer calls to TensorBlockCwiseUnaryOp::Run()). + if (inner_dim_size == block_strides[dim] && + block_strides[dim] == input_strides[dim]) { + inner_dim_size *= block_sizes[dim]; + ++num_size_one_inner_dims; + } else { + break; + } + } + + StorageIndex output_index = 0, input_index = 0; + + const StorageIndex output_stride = + NumDims == 0 ? 1 : block_strides[inner_dim]; + const StorageIndex input_stride = + NumDims == 0 ? 1 : input_strides[inner_dim]; + + const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; + array block_iter_state; + + // Initialize block iterator state. Squeeze away any dimension of size 1. + int num_squeezed_dims = 0; + for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { + const int dim = cond()(i + 1, NumDims - i - 2); + const StorageIndex size = block_sizes[dim]; + if (size == 1) { + continue; + } + BlockIteratorState& state = block_iter_state[num_squeezed_dims]; + state.output_stride = block_strides[dim]; + state.input_stride = input_strides[dim]; + state.size = size; + state.output_span = state.output_stride * (size - 1); + state.input_span = state.input_stride * (size - 1); + state.count = 0; + ++num_squeezed_dims; + } + + // Compute cwise unary op. + const StorageIndex block_total_size = + NumDims == 0 ? 1 : block_sizes.TotalSize(); + for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) { + TensorBlockCwiseUnaryOp::Run(functor, inner_dim_size, output_index, + output_stride, output_data, input_index, + input_stride, input_data); + // Update index. 
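      // Odometer-style update over the squeezed dimensions: the first
      // dimension whose count has not yet wrapped advances both indices by
      // one stride; every dimension before it is rolled back to its start by
      // subtracting the span it just covered.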
+ for (int j = 0; j < num_squeezed_dims; ++j) { + auto& state = block_iter_state[j]; + if (++state.count < state.size) { + output_index += state.output_stride; + input_index += state.input_stride; + break; + } + state.count = 0; + output_index -= state.output_span; + input_index -= state.input_span; + } + } + } +}; + /** * \class TensorBlockCwiseBinaryOp * \ingroup CXX11_Tensor_Module diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index a4d750885..5e812b04d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -108,11 +108,12 @@ struct TensorEvaluator, Device> bool isCopy= false, nByOne = false, oneByN = false; enum { - IsAligned = true, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = true, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index b4c4162ef..76fab39e2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -144,12 +144,13 @@ struct TensorEvaluator, Device> enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets. - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 27c92d8f6..3863ee8c3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -123,6 +123,7 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; @@ -308,6 +309,7 @@ template::PacketAccess & TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index e604456e8..c459fc649 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -241,6 +241,7 @@ struct TensorContractionEvaluatorBase IsAligned = true, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index a7751eee1..1f613d3c7 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -196,6 +196,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = true, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index a07e32db0..2d0e6599f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -308,6 +308,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -780,6 +781,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index d301d0c01..e79958fc9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -243,6 +243,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index 47b5a5a5e..d71b2e34b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -93,6 +93,7 @@ struct TensorEvaluator, Devi IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -255,6 +256,7 @@ struct TensorEvaluator::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 256d499f2..554ee5f59 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -108,6 +108,7 @@ struct TensorEvaluator, Device> IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 028902fea..e30f9ad8e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -43,6 +43,7 @@ struct TensorEvaluator IsAligned = Derived::IsAligned, PacketAccess = (PacketType::size > 1), BlockAccess = internal::is_arithmetic::type>::value, + PreferBlockAccess = false, Layout = Derived::Layout, CoordAccess = NumCoords > 0, RawAccess = true @@ -195,6 +196,7 @@ struct TensorEvaluator IsAligned = 
Derived::IsAligned, PacketAccess = (PacketType::size > 1), BlockAccess = internal::is_arithmetic::type>::value, + PreferBlockAccess = false, Layout = Derived::Layout, CoordAccess = NumCoords > 0, RawAccess = true @@ -288,6 +290,7 @@ struct TensorEvaluator, Device> IsAligned = true, PacketAccess = internal::functor_traits::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -351,27 +354,34 @@ struct TensorEvaluator, Device> typedef TensorCwiseUnaryOp XprType; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & + internal::functor_traits::PacketAccess, + BlockAccess = TensorEvaluator::PacketAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), + : m_device(device), + m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const::type ScalarNoConst; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; + static const int NumDims = internal::array_size::value; + typedef internal::TensorBlock + TensorBlock; + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { @@ -399,6 +409,29 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + m_argImpl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + if (NumDims <= 0) { + output_block->data()[0] = coeff(0); + return; + } + internal::TensorBlockView arg_block(m_device, m_argImpl, + *output_block); + internal::TensorBlockCwiseUnaryIO::Run(m_functor, + output_block->block_sizes(), + output_block + ->block_strides(), + output_block->data(), + arg_block.block_strides(), + arg_block.data()); + } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor @@ -408,6 +441,7 @@ struct TensorEvaluator, Device> private: + const Device& m_device; const UnaryOp m_functor; TensorEvaluator m_argImpl; }; @@ -421,16 +455,18 @@ struct TensorEvaluator XprType; enum { - IsAligned = TensorEvaluator::IsAligned & - TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess & - TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = 
TensorEvaluator::PacketAccess & + TensorEvaluator::PacketAccess & + internal::functor_traits::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess & + TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess | + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -501,7 +537,7 @@ struct TensorEvaluatordata()[0] = coeff(0); + output_block->data()[0] = coeff(Index(0)); return; } internal::TensorBlockView left_block( @@ -543,6 +579,7 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -648,6 +685,7 @@ struct TensorEvaluator PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & PacketType::HasBlend, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index d6ab4d997..480cf1f39 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -136,6 +136,7 @@ struct TensorEvaluator, D IsAligned = false, PacketAccess = true, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 1342e47a6..71ba56773 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -42,6 +42,7 @@ class TensorFixedSize : public TensorBase0), PacketAccess = (internal::packet_traits::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = Options_ & RowMajor ? RowMajor : ColMajor, CoordAccess = true, RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 2778bf5ec..edf9f85d8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -99,6 +99,7 @@ struct TensorEvaluator, Device> IsAligned = true, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = true }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 0dd524a30..b8a57ab99 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -132,7 +132,12 @@ struct IsVectorizable { template struct IsTileable { - static const bool value = TensorEvaluator::BlockAccess; + // Check that block evaluation is supported and it's a preferred option (at + // least one sub-expression has much faster block evaluation, e.g. + // broadcasting). 
+ static const bool value = + TensorEvaluator::BlockAccess & + TensorEvaluator::PreferBlockAccess; }; template , Device> IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 2b2f4a650..1826d7022 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -244,12 +244,13 @@ struct TensorEvaluator, Device> static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = true, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = true, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; typedef internal::TensorBlock diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index 64f2ad81f..e28565009 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -91,6 +91,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index e3165fa10..998757d14 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -120,6 +120,7 @@ struct TensorEvaluator, Device> IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator::RawAccess @@ -183,6 +184,7 @@ template IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false // to be implemented }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index b25c1eabc..6f9294ccf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -111,16 +111,17 @@ struct TensorEvaluator, Device> static const int NumInputDims = internal::array_size::Dimensions>::value; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, // TODO(andydavis, wuke) Enable BlockAccess for the general case when the // performance issue with block-based reshape is resolved. 
- BlockAccess = TensorEvaluator::BlockAccess && - TensorEvaluator::RawAccess && - NumInputDims > 0 && NumOutputDims > 0, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess + BlockAccess = TensorEvaluator::BlockAccess && + TensorEvaluator::RawAccess && + NumInputDims > 0 && NumOutputDims > 0, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; typedef typename internal::remove_const::type ScalarNoConst; @@ -349,6 +350,7 @@ template IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator::RawAccess @@ -508,12 +510,13 @@ struct TensorEvaluator, Devi enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets and sizes. - IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; @@ -785,12 +788,13 @@ struct TensorEvaluator, Device> typedef Sizes Dimensions; enum { - IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess }; typedef typename internal::remove_const::type ScalarNoConst; @@ -972,6 +976,7 @@ struct TensorEvaluator::Layout, RawAccess = false }; @@ -1148,6 +1153,7 @@ struct TensorEvaluator::Layout, CoordAccess = TensorEvaluator::CoordAccess, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index aa1db3c73..59c1704ed 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -97,6 +97,7 @@ struct TensorEvaluator, Device IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = true, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index a0a1ad8f4..4292fe0c2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -95,6 +95,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 80f179ba4..3d534eaa2 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -475,12 +475,13 @@ struct TensorEvaluator, static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = Self::InputPacketAccess && Op::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = Self::InputPacketAccess && Op::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index a6cade50f..6e15e75f9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -137,6 +137,7 @@ template class TensorRef : public TensorBase, Device> IsAligned = false, PacketAccess = false, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorRef::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -414,6 +416,7 @@ struct TensorEvaluator, Device> : public TensorEvaluator, Device IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -255,6 +256,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index b1135f297..641366d9d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -97,6 +97,7 @@ struct TensorEvaluator, Device> { IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 98f125408..a5b541a68 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -111,12 +111,13 @@ struct TensorEvaluator, Device> static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = (PacketType::size > 1), - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; @@ -412,11 +413,12 @@ struct TensorEvaluator, Device> static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = (PacketType::size > 1), - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = 
TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 4b69072f2..221dc96c9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -113,6 +113,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -275,6 +276,7 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h index ea53bb04b..9fc54a4c0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -96,6 +96,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 3c7d8bbc0..c1b7a58ca 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -200,6 +200,7 @@ struct TensorEvaluator, D IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, + PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp index 6feeff231..746f62511 100644 --- a/unsupported/test/cxx11_tensor_block_access.cpp +++ b/unsupported/test/cxx11_tensor_block_access.cpp @@ -517,6 +517,114 @@ static void test_block_io_squeeze_ones() { } } +template +static void test_block_cwise_unary_io_basic() { + typedef internal::scalar_square_op UnaryFunctor; + typedef internal::TensorBlockCwiseUnaryIO + TensorBlockCwiseUnaryIO; + + DSizes block_sizes = RandomDims(); + DSizes strides(ComputeStrides(block_sizes)); + + const auto total_size = block_sizes.TotalSize(); + + // Create a random input tensors. + T* input_data = GenerateRandomData(total_size); + + T* output_data = new T[total_size]; + UnaryFunctor functor; + TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data, + strides, input_data); + for (int i = 0; i < total_size; ++i) { + VERIFY_IS_EQUAL(output_data[i], functor(input_data[i])); + } + + delete[] input_data; + delete[] output_data; +} + +template +static void test_block_cwise_unary_io_squeeze_ones() { + typedef internal::scalar_square_op UnaryFunctor; + typedef internal::TensorBlockCwiseUnaryIO + TensorBlockCwiseUnaryIO; + + DSizes block_sizes(1, 2, 1, 3, 1); + DSizes strides(ComputeStrides(block_sizes)); + + const auto total_size = block_sizes.TotalSize(); + + // Create a random input tensors. 
+ auto* input_data = GenerateRandomData(total_size); + + auto* output_data = new float[total_size]; + UnaryFunctor functor; + TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data, + strides, input_data); + for (int i = 0; i < total_size; ++i) { + VERIFY_IS_EQUAL(output_data[i], functor(input_data[i])); + } + + delete[] input_data; + delete[] output_data; +} + +template +static void test_block_cwise_unary_io_zero_strides() { + typedef internal::scalar_square_op UnaryFunctor; + typedef internal::TensorBlockCwiseUnaryIO + TensorBlockCwiseUnaryIO; + + DSizes rnd_dims = RandomDims<5>(); + + DSizes input_sizes = rnd_dims; + input_sizes[0] = 1; + input_sizes[2] = 1; + input_sizes[4] = 1; + + DSizes input_strides(ComputeStrides(input_sizes)); + input_strides[0] = 0; + input_strides[2] = 0; + input_strides[4] = 0; + + // Generate random data. + auto* input_data = GenerateRandomData(input_sizes.TotalSize()); + + DSizes output_sizes = rnd_dims; + DSizes output_strides(ComputeStrides(output_sizes)); + + const auto output_total_size = output_sizes.TotalSize(); + auto* output_data = new float[output_total_size]; + + UnaryFunctor functor; + TensorBlockCwiseUnaryIO::Run(functor, output_sizes, output_strides, + output_data, input_strides, input_data); + for (int i = 0; i < rnd_dims[0]; ++i) { + for (int j = 0; j < rnd_dims[1]; ++j) { + for (int k = 0; k < rnd_dims[2]; ++k) { + for (int l = 0; l < rnd_dims[3]; ++l) { + for (int m = 0; m < rnd_dims[4]; ++m) { + Index output_index = i * output_strides[0] + j * output_strides[1] + + k * output_strides[2] + l * output_strides[3] + + m * output_strides[4]; + Index input_index = i * input_strides[0] + j * input_strides[1] + + k * input_strides[2] + l * input_strides[3] + + m * input_strides[4]; + VERIFY_IS_EQUAL(output_data[output_index], + functor(input_data[input_index])); + } + } + } + } + } + + delete[] input_data; + delete[] output_data; +} + template static void test_block_cwise_binary_io_basic() { typedef internal::scalar_sum_op BinaryFunctor; @@ -982,6 +1090,9 @@ EIGEN_DECLARE_TEST(cxx11_tensor_block_access) { TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_using_reordered_dimensions); TEST_LAYOUTS(test_block_io_zero_stride); TEST_LAYOUTS(test_block_io_squeeze_ones); + TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_unary_io_basic); + TEST_LAYOUTS(test_block_cwise_unary_io_squeeze_ones); + TEST_LAYOUTS(test_block_cwise_unary_io_zero_strides); TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_binary_io_basic); TEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones); TEST_LAYOUTS(test_block_cwise_binary_io_zero_strides); diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index d16ae4d11..1dc18220c 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -29,6 +29,33 @@ static array RandomDims(int min_dim = 1, int max_dim = 20) { return dims; }; +template +static void test_execute_unary_expr(Device d) { + static constexpr int Options = 0 | Layout; + + // Pick a large enough tensor size to bypass small tensor block evaluation + // optimization. 
+ auto dims = RandomDims(50 / NumDims, 100 / NumDims); + + Tensor src(dims); + Tensor dst(dims); + + src.setRandom(); + const auto expr = src.square(); + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + T square = src.coeff(i) * src.coeff(i); + VERIFY_IS_EQUAL(square, dst.coeff(i)); + } +} + template static void test_execute_binary_expr(Device d) @@ -445,6 +472,10 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) { Eigen::ThreadPool tp(num_threads); Eigen::ThreadPoolDevice tp_device(&tp, num_threads); + CALL_SUBTEST_COMBINATIONS(test_execute_unary_expr, float, 3); + CALL_SUBTEST_COMBINATIONS(test_execute_unary_expr, float, 4); + CALL_SUBTEST_COMBINATIONS(test_execute_unary_expr, float, 5); + CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 3); CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 4); CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr, float, 5); -- cgit v1.2.3 From c64fe9ea1f9f5943864cd9ca27d3fcca07453f82 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Thu, 30 Aug 2018 20:22:16 +0000 Subject: Updates to fix HIP-clang specific compile errors. Compiling the eigen unittests with hip-clang (HIP with clang as the underlying compiler instead of hcc or nvcc), results in compile errors. The changes in this commit fix those compile errors. The main change is to convert a few instances of "__device__" to "EIGEN_DEVICE_FUNC" --- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 74 +++++++++++----------- test/gpu_common.h | 6 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- 3 files changed, 41 insertions(+), 41 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index b0a72e1f9..c4feda87d 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -43,7 +43,7 @@ template<> struct packet_traits : default_packet_traits template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; -template<> __device__ EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { #if defined(EIGEN_HIP_DEVICE_COMPILE) @@ -58,29 +58,29 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { return *reinterpret_cast(from); } -template<> __device__ EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { return __halves2half2(from[0], from[1]); } -template<> __device__ EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { return __halves2half2(from[0], from[0]); } -template<> __device__ EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) { *reinterpret_cast(to) = from; } -template<> __device__ EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& 
from) { to[0] = __low2half(from); to[1] = __high2half(from); } template<> - __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { #if defined(EIGEN_HIP_DEVICE_COMPILE) @@ -102,7 +102,7 @@ template<> } template<> -__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { #if defined(EIGEN_HIP_DEVICE_COMPILE) @@ -123,20 +123,20 @@ __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::ha #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { return __halves2half2(from[0*stride], from[1*stride]); } -template<> __device__ EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) { to[stride*0] = __low2half(from); to[stride*1] = __high2half(from); } -template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { return __low2half(a); } -template<> __device__ EIGEN_STRONG_INLINE half2 pabs(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) { half2 result; unsigned temp = *(reinterpret_cast(&(a))); *(reinterpret_cast(&(result))) = temp & 0x7FFF7FFF; @@ -144,7 +144,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pabs(const half2& a) { } -__device__ EIGEN_STRONG_INLINE void +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { __half a1 = __low2half(kernel.packet[0]); __half a2 = __high2half(kernel.packet[0]); @@ -154,7 +154,7 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1] = __halves2half2(a2, b2); } -template<> __device__ EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { #if defined(EIGEN_HIP_DEVICE_COMPILE) return __halves2half2(a, __hadd(a, __float2half(1.0f))); @@ -171,7 +171,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plset(const Eigen::half& #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { #if defined(EIGEN_HIP_DEVICE_COMPILE) return __hadd2(a, b); @@ -193,7 +193,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd(const half2& a, cons #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { #if defined(EIGEN_HIP_DEVICE_COMPILE) return __hsub2(a, b); @@ -215,7 +215,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub(const half2& a, cons #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { #if defined(EIGEN_HIP_DEVICE_COMPILE) return __hneg2(a); @@ -233,9 +233,9 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } 
-template<> __device__ EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { #if defined(EIGEN_HIP_DEVICE_COMPILE) return __hmul2(a, b); @@ -257,7 +257,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul(const half2& a, cons #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { #if defined(EIGEN_HIP_DEVICE_COMPILE) return __hfma2(a, b, c); @@ -281,7 +281,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd(const half2& a, con #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { #if defined(EIGEN_HIP_DEVICE_COMPILE) #if defined(EIGEN_HAS_OLD_HIP_FP16) @@ -303,7 +303,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv(const half2& a, cons #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { float a1 = __low2float(a); float a2 = __high2float(a); float b1 = __low2float(b); @@ -313,7 +313,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmin(const half2& a, cons return __halves2half2(r1, r2); } -template<> __device__ EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) { float a1 = __low2float(a); float a2 = __high2float(a); float b1 = __low2float(b); @@ -323,7 +323,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax(const half2& a, cons return __halves2half2(r1, r2); } -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { #if defined(EIGEN_HIP_DEVICE_COMPILE) return __hadd(__low2half(a), __high2half(a)); @@ -341,7 +341,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& #endif } -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { #if defined(EIGEN_HIP_DEVICE_COMPILE) __half first = __low2half(a); @@ -363,7 +363,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max(const ha #endif } -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { #if defined(EIGEN_HIP_DEVICE_COMPILE) __half first = __low2half(a); @@ -385,7 +385,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min(const ha #endif } -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { #if defined(EIGEN_HIP_DEVICE_COMPILE) return __hmul(__low2half(a), __high2half(a)); @@ -403,7 +403,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const ha #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { float a1 = __low2float(a); float a2 = __high2float(a); float r1 = log1pf(a1); @@ -411,7 +411,7 @@ template<> 
__device__ EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { return __floats2half2_rn(r1, r2); } -template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { float a1 = __low2float(a); float a2 = __high2float(a); float r1 = expm1f(a1); @@ -422,29 +422,29 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { #if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ defined(EIGEN_HIP_DEVICE_COMPILE) -template<> __device__ EIGEN_STRONG_INLINE +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { return h2log(a); } -template<> __device__ EIGEN_STRONG_INLINE +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(a); } -template<> __device__ EIGEN_STRONG_INLINE +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return h2sqrt(a); } -template<> __device__ EIGEN_STRONG_INLINE +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { return h2rsqrt(a); } #else -template<> __device__ EIGEN_STRONG_INLINE half2 plog(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { float a1 = __low2float(a); float a2 = __high2float(a); float r1 = logf(a1); @@ -452,7 +452,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog(const half2& a) { return __floats2half2_rn(r1, r2); } -template<> __device__ EIGEN_STRONG_INLINE half2 pexp(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { float a1 = __low2float(a); float a2 = __high2float(a); float r1 = expf(a1); @@ -460,7 +460,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return __floats2half2_rn(r1, r2); } -template<> __device__ EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { float a1 = __low2float(a); float a2 = __high2float(a); float r1 = sqrtf(a1); @@ -468,7 +468,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return __floats2half2_rn(r1, r2); } -template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { float a1 = __low2float(a); float a2 = __high2float(a); float r1 = rsqrtf(a1); diff --git a/test/gpu_common.h b/test/gpu_common.h index 3aac49e96..79d4ea694 100644 --- a/test/gpu_common.h +++ b/test/gpu_common.h @@ -61,9 +61,9 @@ void run_on_gpu(const Kernel& ker, int n, const Input& in, Output& out) gpuDeviceSynchronize(); #ifdef EIGEN_USE_HIP - hipLaunchKernelGGL(run_on_gpu_meta_kernel::type, - typename std::decay::type>, + hipLaunchKernelGGL(HIP_KERNEL_NAME(run_on_gpu_meta_kernel::type, + typename std::decay::type>), dim3(Grids), dim3(Blocks), 0, 0, ker, n, d_in, d_out); #else run_on_gpu_meta_kernel<<>>(ker, n, d_in, d_out); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 01d3863da..f3f1640b0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -14,7 +14,7 @@ // clang is incompatible with the CUDA syntax wrt making a kernel a class friend, // so we'll use a macro to make clang happy. 
#ifndef KERNEL_FRIEND -#if defined(__clang__) && defined(__CUDA__) +#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__)) #define KERNEL_FRIEND friend __global__ #else #define KERNEL_FRIEND friend -- cgit v1.2.3 From 81b38a155adf5d527bce5c84cf90cd83c28da445 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 11 Sep 2018 13:32:32 -0700 Subject: Fix compilation of tiled evaluation code with c++03 --- unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 6 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 11 +++++------ unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 5 ++--- unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 5 ++--- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 5 ++--- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 5 ++--- unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 4 ++-- unsupported/test/cxx11_tensor_block_access.cpp | 14 +++++++------- unsupported/test/cxx11_tensor_shuffling.cpp | 10 +++++----- 9 files changed, 30 insertions(+), 35 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 6d90af2d3..13da36257 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -396,8 +396,8 @@ struct TensorBlockCwiseUnaryOp { typedef const Eigen::Array Input; typedef Eigen::Array Output; - typedef Eigen::Map> InputMap; - typedef Eigen::Map> OutputMap; + typedef Eigen::Map > InputMap; + typedef Eigen::Map > OutputMap; const InputScalar* input_base = &input_data[input_index]; OutputScalar* output_base = &output_data[output_index]; @@ -502,7 +502,7 @@ struct TensorBlockCwiseUnaryIO { input_stride, input_data); // Update index. for (int j = 0; j < num_squeezed_dims; ++j) { - auto& state = block_iter_state[j]; + BlockIteratorState& state = block_iter_state[j]; if (++state.count < state.size) { output_index += state.output_stride; input_index += state.input_stride; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 02d061a9c..e5cf93ab0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -596,12 +596,11 @@ struct TensorEvaluator, Device> std::vector* resources) const { // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large // tensors. But this might need further tuning. - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } @@ -617,8 +616,8 @@ struct TensorEvaluator, Device> // equal to m_dimensions for inner dims, a smaller than m_dimensions[i] size // for the first outer dim, and 1 for other outer dims. This is guaranteed // by MergeResourceRequirements() in TensorBlock.h. - const auto& output_block_sizes = output_block->block_sizes(); - const auto& output_block_strides = output_block->block_strides(); + const Dimensions& output_block_sizes = output_block->block_sizes(); + const Dimensions& output_block_strides = output_block->block_strides(); // Find where outer dims start. 
int outer_dim_start = 0; @@ -642,7 +641,7 @@ struct TensorEvaluator, Device> return; } - const auto& input_dims = m_impl.dimensions(); + const Dimensions& input_dims = m_impl.dimensions(); // Pre-fill input_block_sizes, broadcast_block_sizes, // broadcast_block_strides, and broadcast_tensor_strides. Later on we will diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 76fab39e2..b47fa9e8e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -290,11 +290,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 1826d7022..965bd8f1e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -550,11 +550,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 2f765acb7..16dc74afe 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -677,11 +677,10 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 3d534eaa2..eeb2578fd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -771,11 +771,10 @@ struct TensorEvaluator, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kSkewedInnerDims, - block_total_size_max)); + internal::kSkewedInnerDims, block_total_size_max)); 
m_impl.getResourceRequirements(resources); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index a5b541a68..e018d0ab2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -229,10 +229,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector* resources) const { - auto block_total_size_max = numext::maxi( + Eigen::Index block_total_size_max = numext::maxi( 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( - internal::TensorBlockShapeType::kUniformAllDims, block_total_size_max)); + internal::kUniformAllDims, block_total_size_max)); m_impl.getResourceRequirements(resources); } diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp index eec282ba7..ad12ae557 100644 --- a/unsupported/test/cxx11_tensor_block_access.cpp +++ b/unsupported/test/cxx11_tensor_block_access.cpp @@ -535,7 +535,7 @@ static void test_block_cwise_unary_io_basic() { DSizes block_sizes = RandomDims(); DSizes strides(ComputeStrides(block_sizes)); - const auto total_size = block_sizes.TotalSize(); + const Index total_size = block_sizes.TotalSize(); // Create a random input tensors. T* input_data = GenerateRandomData(total_size); @@ -562,12 +562,12 @@ static void test_block_cwise_unary_io_squeeze_ones() { DSizes block_sizes(1, 2, 1, 3, 1); DSizes strides(ComputeStrides(block_sizes)); - const auto total_size = block_sizes.TotalSize(); + const Index total_size = block_sizes.TotalSize(); // Create a random input tensors. - auto* input_data = GenerateRandomData(total_size); + float* input_data = GenerateRandomData(total_size); - auto* output_data = new float[total_size]; + float* output_data = new float[total_size]; UnaryFunctor functor; TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data, strides, input_data); @@ -599,13 +599,13 @@ static void test_block_cwise_unary_io_zero_strides() { input_strides[4] = 0; // Generate random data. 
- auto* input_data = GenerateRandomData(input_sizes.TotalSize()); + float* input_data = GenerateRandomData(input_sizes.TotalSize()); DSizes output_sizes = rnd_dims; DSizes output_strides(ComputeStrides(output_sizes)); - const auto output_total_size = output_sizes.TotalSize(); - auto* output_data = new float[output_total_size]; + const Index output_total_size = output_sizes.TotalSize(); + float* output_data = new float[output_total_size]; UnaryFunctor functor; TensorBlockCwiseUnaryIO::Run(functor, output_sizes, output_strides, diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index 467df39c7..062dd1c0f 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -81,12 +81,12 @@ static void test_expr_shuffling() Tensor expected; expected = tensor.shuffle(shuffles); - Tensor result(5,7,3,2); + Tensor result(5, 7, 3, 2); - array src_slice_dim{{2,3,1,7}}; - array src_slice_start{{0,0,0,0}}; - array dst_slice_dim{{1,7,3,2}}; - array dst_slice_start{{0,0,0,0}}; + array src_slice_dim({2, 3, 1, 7}); + array src_slice_start({0, 0, 0, 0}); + array dst_slice_dim({1, 7, 3, 2}); + array dst_slice_start({0, 0, 0, 0}); for (int i = 0; i < 5; ++i) { result.slice(dst_slice_start, dst_slice_dim) = -- cgit v1.2.3 From 01197e44527941c95f9a63e4f60ab3a989f12cbe Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 13 Sep 2018 15:03:36 -0700 Subject: Fix warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 -- unsupported/test/cxx11_tensor_executor.cpp | 8 ++++---- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index eeb2578fd..261dbffa8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -1140,7 +1140,6 @@ struct TensorEvaluator, const Index max_coeff_count, const DSizes& input_slice_sizes, DSizes* target_input_block_sizes) const { - typedef typename internal::packet_traits::type Packet; typedef internal::BlockReducer BlockReducer; // TODO(andydavis) Compute reducer overhead correctly for the case where // we are preserving the inner most dimension, and a single reducer @@ -1168,7 +1167,6 @@ struct TensorEvaluator, } else if (!first_preserved_dim_allocated) { // TODO(andydavis) Include output block size in this L1 working set // calculation. 
-      const Index allocated = max_coeff_count - coeff_to_allocate;
       const Index alloc_size = numext::maxi(
           static_cast(1), coeff_to_allocate / reducer_overhead);
       (*target_input_block_sizes)[dim] =
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
index 8639e7b38..1bb99854c 100644
--- a/unsupported/test/cxx11_tensor_executor.cpp
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -27,7 +27,7 @@ static array RandomDims(int min_dim = 1, int max_dim = 20) {
     dims[i] = internal::random(min_dim, max_dim);
   }
   return dims;
-};
+}
 
 template
@@ -117,7 +117,7 @@ static void test_execute_broadcasting(Device d)
   for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
     VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
   }
-};
+}
 
 template
@@ -155,7 +155,7 @@ static void test_execute_chipping_rvalue(Device d) {
   TEST_CHIPPING(5)
 
 #undef TEST_CHIPPING
-};
+}
 
 template
@@ -207,7 +207,7 @@ static void test_execute_chipping_lvalue(Device d) {
   TEST_CHIPPING(5)
 
 #undef TEST_CHIPPING
-};
+}
 
 template
-- cgit v1.2.3
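A minimal standalone sketch of the idea behind the IsTileable change in the first patch of this series: tiled (block) evaluation is selected only when an expression both supports block access and at least one sub-expression prefers it (e.g. broadcasting). The struct and function names below are illustrative stand-ins, not Eigen's actual internals, and this code is not part of any patch above.

#include <iostream>

// Illustrative stand-in for the evaluator flags touched throughout this series.
struct EvaluatorTraits {
  bool block_access;         // the evaluator can produce/consume tensor blocks
  bool prefer_block_access;  // some sub-expression is markedly faster when tiled
};

// Mirrors the updated IsTileable logic: tiling is used only when block
// evaluation is both supported and preferred.
bool is_tileable(const EvaluatorTraits& t) {
  return t.block_access && t.prefer_block_access;
}

int main() {
  const EvaluatorTraits broadcast_expr = {true, true};    // e.g. broadcasting: tiling pays off
  const EvaluatorTraits plain_unary_expr = {true, false}; // block access supported, but not preferred
  std::cout << std::boolalpha
            << "broadcast tileable: " << is_tileable(broadcast_expr) << "\n"
            << "plain unary tileable: " << is_tileable(plain_unary_expr) << "\n";
  return 0;
}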