From 1c879eb010df8e53e5ac016ee5d155db2c721c2b Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 10 Dec 2019 15:40:23 -0800 Subject: Remove V2 suffix from TensorBlock --- unsupported/Eigen/CXX11/Tensor | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 8 +- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 16 +- unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 1481 ++++++++++++++++++++ unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h | 1481 -------------------- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 50 +- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 34 +- .../Eigen/CXX11/src/Tensor/TensorConcatenation.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorConversion.h | 14 +- .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 8 +- unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 10 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 110 +- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 16 +- unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 14 +- .../CXX11/src/Tensor/TensorForwardDeclarations.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 16 +- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorInflation.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 58 +- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 20 +- unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 4 +- unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 12 +- unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 22 +- unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 32 +- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 4 +- unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 4 +- unsupported/test/cxx11_tensor_block_access.cpp | 68 +- unsupported/test/cxx11_tensor_block_eval.cpp | 8 +- unsupported/test/cxx11_tensor_block_io.cpp | 26 +- 38 files changed, 1793 insertions(+), 1793 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 10786048e..2640f9565 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -97,7 +97,7 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorGlobalFunctions.h" #include "src/Tensor/TensorBase.h" -#include "src/Tensor/TensorBlockV2.h" +#include "src/Tensor/TensorBlock.h" #include "src/Tensor/TensorEvaluator.h" #include "src/Tensor/TensorExpr.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index 68bfd141a..91a6f8d6c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -88,7 +88,7 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -96,7 
+96,7 @@ struct TensorEvaluator, Device> }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -229,7 +229,7 @@ struct TensorEvaluator, Devi enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator >, Device>::Layout, CoordAccess = false, // to be implemented @@ -237,7 +237,7 @@ struct TensorEvaluator, Devi }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 22d672aa4..72f072cf2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -108,8 +108,8 @@ struct TensorEvaluator, Device> TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - BlockAccessV2 = TensorEvaluator::BlockAccessV2 & - TensorEvaluator::BlockAccessV2, + BlockAccess = TensorEvaluator::BlockAccess & + TensorEvaluator::BlockAccess, PreferBlockAccess = TensorEvaluator::PreferBlockAccess | TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, @@ -120,7 +120,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - typedef typename TensorEvaluator::TensorBlockV2 + typedef typename TensorEvaluator::TensorBlock RightTensorBlock; //===--------------------------------------------------------------------===// @@ -201,13 +201,13 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { - return internal::TensorBlockV2ResourceRequirements::merge( + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::merge( m_leftImpl.getResourceRequirements(), m_rightImpl.getResourceRequirements()); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2( + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock( TensorBlockDesc& desc, TensorBlockScratch& scratch) { if (TensorEvaluator::RawAccess && m_leftImpl.data() != NULL) { @@ -218,10 +218,10 @@ struct TensorEvaluator, Device> /*dst_strides=*/internal::strides(m_leftImpl.dimensions())); } - RightTensorBlock block = m_rightImpl.blockV2(desc, scratch, /*root_of_expr_ast=*/true); + RightTensorBlock block = m_rightImpl.block(desc, scratch, /*root_of_expr_ast=*/true); // If block was evaluated into a destination, there is no need to do assignment. 
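      // (A block of kind kMaterializedInOutput was evaluated directly into the
      // destination buffer attached to `desc` above, i.e. into the left-hand
      // side's own storage, so writing it again would be redundant.)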
if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { - m_leftImpl.writeBlockV2(desc, block); + m_leftImpl.writeBlock(desc, block); } block.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h new file mode 100644 index 000000000..222333847 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -0,0 +1,1481 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H +#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H + +namespace Eigen { +namespace internal { + +// -------------------------------------------------------------------------- // +// Forward declarations for templates defined below. +template +class TensorBlockIO; + +// -------------------------------------------------------------------------- // +// Helper function to compute strides for densely stored buffer of given +// dimensions. + +// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use +// this function instead everywhere. +template +EIGEN_ALWAYS_INLINE DSizes strides( + const DSizes& dimensions) { + DSizes strides; + if (NumDims == 0) return strides; + + // TODO(ezhulenev): Use templates to unroll this loop (similar to + // h_array_reduce in CXX11meta.h)? Benchmark it. + if (static_cast(Layout) == static_cast(ColMajor)) { + strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + strides[i] = strides[i - 1] * dimensions[i - 1]; + } + } else { + strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dimensions[i + 1]; + } + } + + return strides; +} + +template +EIGEN_ALWAYS_INLINE DSizes strides( + const Eigen::array& dimensions) { + return strides(DSizes(dimensions)); +} + +template +EIGEN_STRONG_INLINE DSizes strides( + const Sizes& sizes) { + return strides(DSizes(sizes)); +} + +// -------------------------------------------------------------------------- // + +// Tensor block shape type defines what are the shape preference for the blocks +// extracted from the larger tensor. +// +// Example: blocks of 100 elements from the large 100x100 tensor: +// - tensor: 100x100 +// - target_block_size: 100 +// +// TensorBlockShapeType: +// - kUniformAllDims: 100 blocks of size 10x10 +// - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column +// or row major layout) +enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims }; + +struct TensorBlockResourceRequirements { + TensorBlockShapeType shape_type; + size_t size; + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE TensorBlockResourceRequirements + merge(const TensorBlockResourceRequirements &lhs, + const TensorBlockResourceRequirements &rhs) { + return {merge(lhs.shape_type, rhs.shape_type), merge(rhs.size, lhs.size)}; + } + + // This is a resource requirement that should be returned from expressions + // that do not have any block evaluation preference (e.g. default tensor + // expression with raw buffer access). 
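+  // For example, `any()` below returns {kUniformAllDims, 1}; merged with a
+  // real preference such as {kSkewedInnerDims, 1024} it yields
+  // {kSkewedInnerDims, 1024} (a skewed shape wins, and sizes are combined
+  // with a max), so an expression with no preference never constrains its
+  // neighbors.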
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
+    return {TensorBlockShapeType::kUniformAllDims, 1};
+  }
+
+ private:
+  using Requirements = TensorBlockResourceRequirements;
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
+    return numext::maxi(lhs_size, rhs_size);
+  }
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE TensorBlockShapeType merge(
+      TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
+    return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
+            rhs == TensorBlockShapeType::kSkewedInnerDims)
+               ? TensorBlockShapeType::kSkewedInnerDims
+               : TensorBlockShapeType::kUniformAllDims;
+  }
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockDescriptor specifies a block offset within a tensor and the block
+// sizes along each of the tensor dimensions.
+
+template <int NumDims, typename IndexType = Eigen::Index>
+class TensorBlockDescriptor {
+ public:
+  typedef DSizes<IndexType, NumDims> Dimensions;
+
+  // If we evaluate a Tensor assignment, and the expression on the left already
+  // has a memory buffer, then we might do a performance optimization and
+  // evaluate the root expression directly into the final output memory.
+  // Sometimes it's also possible to reuse that buffer for materializing
+  // subexpressions inside an expression tree, to avoid dynamic memory
+  // allocation.
+  //
+  // The pointer type of the underlying storage is erased, because passing the
+  // Scalar type through all the expression evaluation layers is way too many
+  // templates. In practice the destination buffer type should always match the
+  // evaluated expression scalar type.
+  class DestinationBuffer {
+   public:
+    enum DestinationBufferKind : int {
+      // The above explicit specification of "int" as the enum basetype is
+      // needed to get around a HIPCC link error ("the field type is not
+      // amp-compatible") which is issued for class members with the enum type.
+      // TODO(rocm):
+      // remove the "int" basetype once HIPCC has been fixed to not error out
+      // in the above scenario.
+
+      // Destination buffer is not defined (`m_data` == nullptr).
+      kEmpty,
+
+      // The tensor block defined by an owning tensor block descriptor fits
+      // contiguously into the destination buffer. In this case it's safe to
+      // materialize the tensor block in the destination buffer, wrap it in a
+      // TensorMap, and use it to build an Eigen expression on top of it.
+      kContiguous,
+
+      // Destination buffer strides do not match the strides of the
+      // contiguously stored block, and it's impossible to define a TensorMap
+      // over this buffer. However, if we are evaluating the root of an
+      // expression tree, we can still materialize an output into this
+      // destination, because we can guarantee that no one will ever access it
+      // through the block API.
+      //
+      // In theory it is possible to build a valid TensorStriding<TensorMap>
+      // expression on top of this destination buffer, however it has
+      // inefficient coeff/packet access, and defeats the purpose of the fast
+      // block evaluation API.
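+      //
+      // For instance (hypothetical sizes): a [2, 4] column-major block stored
+      // contiguously has strides [1, 2]; a destination that is a column slice
+      // of a larger [3, 4] buffer has strides [1, 3]. Such a buffer can only
+      // absorb a root-level output, so its kind is kStrided.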
+ kStrided + }; + + template + Scalar* data() const { + eigen_assert(m_data_type_size == sizeof(Scalar)); + return static_cast(m_data); + } + + const Dimensions& strides() const { return m_strides; } + const DestinationBufferKind& kind() const { return m_kind; } + + private: + friend class TensorBlockDescriptor; + + DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {} + + template + DestinationBuffer(Scalar* data, const Dimensions& strides, + DestinationBufferKind kind) + : m_data(static_cast(data)), + m_data_type_size(sizeof(Scalar)), + m_strides(strides), + m_kind(kind) {} + + template + static DestinationBuffer make(const TensorBlockDescriptor& desc, + Scalar* data, const Dimensions& strides) { + return DestinationBuffer(data, strides, kind(desc, strides)); + } + + template + static DestinationBufferKind kind(const TensorBlockDescriptor& desc, + const Dimensions& strides) { + const Dimensions& desc_dims = desc.dimensions(); + const Dimensions& desc_strides = internal::strides(desc_dims); + for (int i = 0; i < NumDims; ++i) { + if (desc_dims[i] == 1) continue; + if (desc_strides[i] != strides[i]) return kStrided; + } + return kContiguous; + } + + // Storage pointer is type erased, to reduce template bloat, but we still + // keep the size of the underlying element type for error checking. + void* m_data; + size_t m_data_type_size; + + // Destination buffer dimensions always match the dimensions of a tensor + // block descriptor it belongs to, however strides might be different. + Dimensions m_strides; + + DestinationBufferKind m_kind; + }; + + TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions, + const DestinationBuffer& destination) + : m_offset(offset), + m_dimensions(dimensions), + m_destination(destination) {} + + TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions) + : m_offset(offset), + m_dimensions(dimensions), + m_destination(DestinationBuffer()) {} + + IndexType offset() const { return m_offset; } + const Dimensions& dimensions() const { return m_dimensions; } + IndexType dimension(int index) const { return m_dimensions[index]; } + IndexType size() const { return array_prod(m_dimensions); } + + const DestinationBuffer& destination() const { return m_destination; } + + template + void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) { + eigen_assert(dst_base != NULL); + m_destination = + DestinationBuffer::template make(*this, dst_base, dst_strides); + } + + template + void AddDestinationBuffer( + Scalar* dst_base, + const DSizes& dst_strides) { + // DSizes constructor will do index type promotion if it's safe. + AddDestinationBuffer(dst_base, Dimensions(dst_strides)); + } + + TensorBlockDescriptor& DropDestinationBuffer() { + m_destination.m_data = NULL; + m_destination.m_kind = DestinationBuffer::kEmpty; + return *this; + } + + bool HasDestinationBuffer() const { + return m_destination.kind() != DestinationBuffer::kEmpty; + } + + // Returns a copy of `*this` with updated offset. + TensorBlockDescriptor WithOffset(IndexType offset) const { + return TensorBlockDescriptor(offset, m_dimensions, m_destination); + } + + private: + // Offset and dimensions are immutable after construction. Block descriptor + // can only be mutated by adding or dropping destination. 
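+  //
+  // A typical lifecycle sketch (names and sizes hypothetical):
+  //
+  //   TensorBlockDescriptor<2> desc(/*offset=*/0, /*dimensions=*/{16, 16});
+  //   desc.AddDestinationBuffer(dst_ptr, internal::strides<ColMajor>(dims));
+  //   ... evaluator materializes the block into the destination ...
+  //   desc.DropDestinationBuffer();  // the buffer must not be claimed twice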
+ const IndexType m_offset; + const Dimensions m_dimensions; + DestinationBuffer m_destination; +}; + +// -------------------------------------------------------------------------- // +// TensorBlockMapper is responsible for iterating over the blocks of a tensor. + +template +class TensorBlockMapper { + typedef TensorBlockDescriptor BlockDescriptor; + + public: + typedef DSizes Dimensions; + + TensorBlockMapper() = default; + TensorBlockMapper(const DSizes& dimensions, + const TensorBlockResourceRequirements& requirements) + : m_tensor_dimensions(dimensions), m_requirements(requirements) { + // Initialize `m_block_dimensions`. + InitializeBlockDimensions(); + + // Calculate block counts by dimension and total block count. + DSizes block_count; + for (int i = 0; i < NumDims; ++i) { + block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]); + } + m_total_block_count = array_prod(block_count); + + // Calculate block strides (used for enumerating blocks). + m_tensor_strides = strides(m_tensor_dimensions); + m_block_strides = strides(block_count); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const { + return m_total_block_count; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const { + return m_block_dimensions.TotalSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes& + blockDimensions() const { + return m_block_dimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + BlockDescriptor blockDescriptor(IndexType block_index) const { + static const bool isColMajor = Layout == static_cast(ColMajor); + + IndexType offset = 0; + DSizes dimensions; + + if (NumDims == 0) return BlockDescriptor(offset, dimensions); + + // Iterate outer -> inner dimensions. + for (int i = NumDims - 1; i >= 0; --i) { + const int dim = isColMajor ? i : NumDims - i - 1; + + const IndexType idx = block_index / m_block_strides[dim]; + block_index -= idx * m_block_strides[dim]; + + const IndexType coord = idx * m_block_dimensions[dim]; + dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord, + m_block_dimensions[dim]); + offset += coord * m_tensor_strides[dim]; + } + + return {offset, dimensions}; + } + + private: + void InitializeBlockDimensions() { + // Requested block shape and size. + const TensorBlockShapeType shape_type = m_requirements.shape_type; + const IndexType target_block_size = + numext::maxi(1, static_cast(m_requirements.size)); + + // Corner case: one of the dimensions is zero. Logic below is too complex + // to handle this case on a general basis, just use unit block size. + // Note: we must not yield blocks with zero dimensions (recipe for + // overflows/underflows, divisions by zero and NaNs later). + if (m_tensor_dimensions.TotalSize() == 0) { + for (int i = 0; i < NumDims; ++i) { + m_block_dimensions[i] = 1; + } + return; + } + + // If tensor fits into a target block size, evaluate it as a single block. + if (m_tensor_dimensions.TotalSize() <= target_block_size) { + m_block_dimensions = m_tensor_dimensions; + return; + } + + static const bool isColMajor = Layout == static_cast(ColMajor); + + // Block shape skewed towards inner dimension. + if (shape_type == TensorBlockShapeType::kSkewedInnerDims) { + IndexType coeff_to_allocate = target_block_size; + + for (int i = 0; i < NumDims; ++i) { + const int dim = isColMajor ? 
i : NumDims - i - 1;
+        m_block_dimensions[dim] =
+            numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
+        coeff_to_allocate = divup(
+            coeff_to_allocate,
+            numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
+      }
+      eigen_assert(coeff_to_allocate == 1);
+
+    } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
+      // Tensor will not fit within 'target_block_size' budget: calculate
+      // tensor block dimension sizes based on a "square" dimension size
+      // target.
+      const IndexType dim_size_target = convert_index<IndexType>(
+          std::pow(static_cast<float>(target_block_size),
+                   1.0f / static_cast<float>(m_block_dimensions.rank())));
+
+      for (int i = 0; i < NumDims; ++i) {
+        // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
+        // a multiple of the packet size. Note that reducing
+        // 'block_dim_size' in this manner can increase the number of
+        // blocks, and so will amplify any per-block overhead.
+        m_block_dimensions[i] =
+            numext::mini(dim_size_target, m_tensor_dimensions[i]);
+      }
+
+      // Add any un-allocated coefficients to inner dimension(s).
+      IndexType total_size = m_block_dimensions.TotalSize();
+      for (int i = 0; i < NumDims; ++i) {
+        const int dim = isColMajor ? i : NumDims - i - 1;
+
+        if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
+          const IndexType total_size_other_dims =
+              total_size / m_block_dimensions[dim];
+          const IndexType alloc_avail =
+              divup(target_block_size, total_size_other_dims);
+          if (alloc_avail == m_block_dimensions[dim]) {
+            // Insufficient excess coefficients to allocate.
+            break;
+          }
+          m_block_dimensions[dim] =
+              numext::mini(m_tensor_dimensions[dim], alloc_avail);
+          total_size = total_size_other_dims * m_block_dimensions[dim];
+        }
+      }
+
+    } else {
+      eigen_assert(false);  // unknown block shape
+    }
+
+    eigen_assert(m_block_dimensions.TotalSize() >=
+                 numext::mini(target_block_size,
+                              m_tensor_dimensions.TotalSize()));
+  }
+
+  DSizes<IndexType, NumDims> m_tensor_dimensions;
+  TensorBlockResourceRequirements m_requirements;
+
+  DSizes<IndexType, NumDims> m_block_dimensions;
+  IndexType m_total_block_count;
+
+  DSizes<IndexType, NumDims> m_tensor_strides;
+  DSizes<IndexType, NumDims> m_block_strides;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockScratchAllocator is responsible for allocating temporary buffers
+// for block evaluation (output or input block materialization). Given that
+// Eigen expression traversal order is deterministic, all temporary allocations
+// happen in the same order, and usually have exactly the same size. The
+// scratch allocator keeps a trace of all dynamic allocations, and after the
+// first block evaluation is completed, we should be able to reuse all the
+// temporary buffers for the next block evaluation.
+
+template <typename Device>
+class TensorBlockScratchAllocator {
+ public:
+  explicit TensorBlockScratchAllocator(const Device& device)
+      : m_device(device), m_allocation_index(0) {}
+
+  ~TensorBlockScratchAllocator() {
+    for (size_t i = 0; i < m_allocations.size(); ++i) {
+      m_device.deallocate(m_allocations[i].ptr);
+    }
+  }
+
+  void* allocate(size_t size) {
+    // TODO(ezhulenev): Remove when replaced with inlined vector.
+    if (m_allocations.capacity() == 0) m_allocations.reserve(8);
+
+    // Check if we already have an existing allocation at the current index.
+    const int num_allocations = static_cast<int>(m_allocations.size());
+    const bool has_allocation = m_allocation_index < num_allocations;
+
+    // Allocation index can't be larger than the number of allocations.
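+    // (For example, if block #0 allocated 512 and then 128 bytes, block #1
+    // served after a reset() gets the same two pointers in the same order,
+    // and an allocation only grows when a larger size is requested.)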
+    eigen_assert(m_allocation_index <= num_allocations);
+
+    // If we have an existing allocation, and its size is larger than or equal
+    // to the requested size, we do nothing.
+
+    // If the current allocation can't fit the requested size, we deallocate
+    // it, and replace it with a larger allocation.
+    if (has_allocation && m_allocations[m_allocation_index].size < size) {
+      m_device.deallocate(m_allocations[m_allocation_index].ptr);
+      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
+      m_allocations[m_allocation_index].size = size;
+    }
+
+    // Make a new allocation if we don't have an existing one.
+    if (!has_allocation) {
+      Allocation allocation;
+      allocation.ptr = m_device.allocate(size);
+      allocation.size = size;
+      m_allocations.push_back(allocation);
+    }
+
+    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
+    eigen_assert(m_allocations[m_allocation_index].size >= size);
+
+    return m_allocations[m_allocation_index++].ptr;
+  }
+
+  void reset() { m_allocation_index = 0; }
+
+ private:
+  struct Allocation {
+    void* ptr;
+    size_t size;
+  };
+
+  const Device& m_device;
+  int m_allocation_index;
+  // TODO(ezhulenev): This should be an inlined vector.
+  std::vector<Allocation> m_allocations;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockKind represents all possible block kinds that can be produced by
+// the TensorEvaluator::evalBlock function.
+enum TensorBlockKind {
+  // Tensor block that is a lazy expression that must be assigned to a
+  // destination using TensorBlockAssign.
+  kExpr,
+
+  // Tensor block that is a view into a memory buffer owned by an underlying
+  // Tensor expression (e.g. it can be a view into a Tensor buffer).
+  kView,
+
+  // Tensor block that was materialized in a scratch memory buffer, allocated
+  // with TensorBlockScratchAllocator. This block must be copied to a
+  // destination, similar to a block of `kExpr` type.
+  kMaterializedInScratch,
+
+  // Tensor block that was materialized directly into the final output memory
+  // buffer. For example, if the left side of an assignment is a Tensor, we can
+  // directly materialize the block in the destination memory.
+  //
+  // If strides in the output buffer do not match tensor block strides, the
+  // Tensor expression will be invalid, and should not be used by
+  // TensorBlockAssign or for constructing another block expression.
+  kMaterializedInOutput
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockNotImplemented should be used to define the TensorBlock typedef
+// in TensorEvaluators that do not support block evaluation.
+
+class TensorBlockNotImplemented {
+ public:
+  typedef void XprType;
+};
+
+// -------------------------------------------------------------------------- //
+// XprScalar extracts the Scalar type from an Eigen expression (if the
+// expression type is not void). It's required to be able to define a lazy
+// block expression for argument types that do not support block evaluation.
+
+template <typename XprType>
+struct XprScalar {
+  typedef typename XprType::Scalar type;
+};
+template <>
+struct XprScalar<void> {
+  typedef void type;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorMaterializedBlock is a fully evaluated block of the original tensor,
+// and XprType is just a TensorMap over the data. This block type is typically
+// used to materialize blocks of tensor expressions that can't be efficiently
+// represented as lazy Tensor expressions with fast coeff/packet operations,
+// e.g.
we materialize all broadcasts into evaluated blocks. +// +// TensorMaterializedBlock does not own its memory buffer, it's either a memory +// buffer that backs the original expression (e.g. block is just a view into a +// Tensor), or a memory buffer allocated with scratch allocator, and in this +// case the scratch allocator will deallocate it at the end of block based +// expression execution. +// +// If the block was evaluated directly into the output buffer, and strides in +// the output buffer do not match block strides, the TensorMap expression will +// be invalid, and should never be used in block assignment or any other tensor +// expression. + +template +class TensorMaterializedBlock { + public: + typedef DSizes Dimensions; + typedef TensorMap > XprType; + + TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, + const Dimensions& dimensions, bool valid_expr = true) + : m_kind(kind), + m_data(data), + m_dimensions(dimensions), + m_expr(m_data, m_dimensions), + m_valid_expr(valid_expr) { + eigen_assert(m_kind == internal::TensorBlockKind::kView || + m_kind == internal::TensorBlockKind::kMaterializedInScratch || + m_kind == internal::TensorBlockKind::kMaterializedInOutput); + } + + TensorBlockKind kind() const { return m_kind; } + // NOTE(ezhulenev): Returning XprType by value like in other block types + // causes asan failures. The theory is that XprType::Nested doesn't work + // properly for TensorMap. + const XprType& expr() const { + eigen_assert(m_valid_expr); + return m_expr; + } + const Scalar* data() const { return m_data; } + void cleanup() {} + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + + // TensorMaterializedBlock can be backed by different types of storage: + // + // (1) Contiguous block of memory allocated with scratch allocator. + // (2) Contiguous block of memory reused from tensor block descriptor + // destination buffer. + // (3) Strided block of memory reused from tensor block descriptor + // destination buffer. + // + class Storage { + public: + Scalar* data() const { return m_data; } + const Dimensions& dimensions() const { return m_dimensions; } + const Dimensions& strides() const { return m_strides; } + + TensorMaterializedBlock AsTensorMaterializedBlock() const { + return TensorMaterializedBlock( + m_materialized_in_output + ? internal::TensorBlockKind::kMaterializedInOutput + : internal::TensorBlockKind::kMaterializedInScratch, + m_data, m_dimensions, !m_strided_storage); + } + + private: + friend class TensorMaterializedBlock; + + Storage(Scalar* data, const Dimensions& dimensions, + const Dimensions& strides, bool materialized_in_output, + bool strided_storage) + : m_data(data), + m_dimensions(dimensions), + m_strides(strides), + m_materialized_in_output(materialized_in_output), + m_strided_storage(strided_storage) {} + + Scalar* m_data; + Dimensions m_dimensions; + Dimensions m_strides; + bool m_materialized_in_output; + bool m_strided_storage; + }; + + // Creates a storage for materialized block either from the block descriptor + // destination buffer, or allocates a new buffer with scratch allocator. + template + EIGEN_STRONG_INLINE static Storage prepareStorage( + TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool allow_strided_storage = false) { + // Try to reuse destination as an output block buffer. 
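+    // (E.g. a kContiguous destination is claimed for the block and dropped
+    // from the descriptor; a kStrided destination is claimed only when the
+    // caller allows strided storage; otherwise we fall back to a scratch
+    // allocation below.)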
+    typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
+
+    if (desc.destination().kind() == DestinationBuffer::kContiguous) {
+      Scalar* buffer = desc.destination().template data<Scalar>();
+      desc.DropDestinationBuffer();
+      return Storage(buffer, desc.dimensions(),
+                     internal::strides<Layout>(desc.dimensions()),
+                     /*materialized_in_output=*/true,
+                     /*strided_storage=*/false);
+
+    } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
+               allow_strided_storage) {
+      Scalar* buffer = desc.destination().template data<Scalar>();
+      desc.DropDestinationBuffer();
+      return Storage(buffer, desc.dimensions(), desc.destination().strides(),
+                     /*materialized_in_output=*/true, /*strided_storage=*/true);
+
+    } else {
+      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
+      return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
+                     internal::strides<Layout>(desc.dimensions()),
+                     /*materialized_in_output=*/false,
+                     /*strided_storage=*/false);
+    }
+  }
+
+  // Creates a materialized block for the given descriptor from a memory
+  // buffer.
+  template <typename DataDimensions, typename TensorBlockDesc,
+            typename TensorBlockScratch>
+  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
+      const Scalar* data, const DataDimensions& data_dims,
+      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+    eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
+
+    // If the tensor block's dimensions cover a contiguous slice of the
+    // underlying memory, we can skip the block buffer memory allocation, and
+    // construct a block from the existing `data` memory buffer.
+    //
+    // Example: (RowMajor layout)
+    //   data_dims:         [11, 12, 13, 14]
+    //   desc.dimensions(): [1,  1,  3,  14]
+    //
+    // In this case we can construct a TensorBlock starting at
+    // `data + desc.offset()`, with `desc.dimensions()` block sizes.
+    static const bool is_col_major = Layout == ColMajor;
+
+    // Find out how many inner dimensions have a matching size.
+    int num_matching_inner_dims = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      int dim = is_col_major ? i : NumDims - i - 1;
+      if (data_dims[dim] != desc.dimensions()[dim]) break;
+      ++num_matching_inner_dims;
+    }
+
+    // All the outer dimensions must be of size `1`, except a single dimension
+    // before the matching inner dimension (`3` in the example above).
+    bool can_use_direct_access = true;
+    for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
+      int dim = is_col_major ? i : NumDims - i - 1;
+      if (desc.dimension(dim) != 1) {
+        can_use_direct_access = false;
+        break;
+      }
+    }
+
+    if (can_use_direct_access) {
+      const Scalar* block_start = data + desc.offset();
+      return TensorMaterializedBlock(internal::TensorBlockKind::kView,
+                                     block_start, desc.dimensions());
+
+    } else {
+      // Reuse destination buffer or allocate new buffer with scratch
+      // allocator.
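+      // (E.g. with RowMajor data_dims [11, 12, 13, 14] and block dimensions
+      // [2, 1, 13, 14] the block covers two slabs of memory separated by
+      // 12 * 13 * 14 elements, so it cannot be a kView and must be copied out
+      // with TensorBlockIO below.)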
+      const Storage storage = prepareStorage(desc, scratch);
+
+      typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
+          TensorBlockIO;
+      typedef typename TensorBlockIO::Dst TensorBlockIODst;
+      typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
+                           data, desc.offset());
+      TensorBlockIODst dst(storage.dimensions(), storage.strides(),
+                           storage.data());
+
+      TensorBlockIO::Copy(dst, src);
+      return storage.AsTensorMaterializedBlock();
+    }
+  }
+
+ private:
+  TensorBlockKind m_kind;
+  const Scalar* m_data;
+  Dimensions m_dimensions;
+  XprType m_expr;
+  bool m_valid_expr;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorCwiseUnaryBlock is a lazy tensor expression block that applies a
+// UnaryOp functor to the blocks produced by the underlying Tensor expression.
+
+template <typename UnaryOp, typename ArgTensorBlock>
+class TensorCwiseUnaryBlock {
+  static const bool NoArgBlockAccess =
+      internal::is_void<typename ArgTensorBlock::XprType>::value;
+
+ public:
+  typedef typename conditional<
+      NoArgBlockAccess, void,
+      TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
+      type XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
+      : m_arg_block(arg_block), m_functor(functor) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+
+  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
+  const Scalar* data() const { return NULL; }
+  void cleanup() { m_arg_block.cleanup(); }
+
+ private:
+  ArgTensorBlock m_arg_block;
+  UnaryOp m_functor;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorCwiseBinaryBlock is a lazy tensor expression block that applies a
+// BinaryOp functor to the blocks produced by the underlying Tensor
+// expressions.
+
+template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
+class TensorCwiseBinaryBlock {
+  static const bool NoArgBlockAccess =
+      internal::is_void<typename LhsTensorBlock::XprType>::value ||
+      internal::is_void<typename RhsTensorBlock::XprType>::value;
+
+ public:
+  typedef typename conditional<
+      NoArgBlockAccess, void,
+      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
+                          const typename RhsTensorBlock::XprType> >::type
+      XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
+                         const RhsTensorBlock& right_block,
+                         const BinaryOp& functor)
+      : m_left_block(left_block),
+        m_right_block(right_block),
+        m_functor(functor) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+
+  XprType expr() const {
+    return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
+  }
+
+  const Scalar* data() const { return NULL; }
+
+  void cleanup() {
+    m_left_block.cleanup();
+    m_right_block.cleanup();
+  }
+
+ private:
+  LhsTensorBlock m_left_block;
+  RhsTensorBlock m_right_block;
+  BinaryOp m_functor;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorUnaryExprBlock is a lazy tensor expression block that can construct
+// an arbitrary tensor expression from a block of the underlying type (this is
+// a generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
+ +template +class TensorUnaryExprBlock { + + typedef typename ArgTensorBlock::XprType ArgXprType; + static const bool NoArgBlockAccess = internal::is_void::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + typename BlockFactory::template XprType::type>::type XprType; + + typedef typename XprScalar::type Scalar; + + TensorUnaryExprBlock(const ArgTensorBlock& arg_block, + const BlockFactory& factory) + : m_arg_block(arg_block), m_factory(factory) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + XprType expr() const { return m_factory.expr(m_arg_block.expr()); } + const Scalar* data() const { return NULL; } + void cleanup() { m_arg_block.cleanup(); } + + private: + ArgTensorBlock m_arg_block; + BlockFactory m_factory; +}; + +// -------------------------------------------------------------------------- // +// TensorTernaryExprBlock is a lazy tensor expression block that can construct +// an arbitrary tensor expression from three blocks of the underlying type. + +template +class TensorTernaryExprBlock { + + typedef typename Arg1TensorBlock::XprType Arg1XprType; + typedef typename Arg2TensorBlock::XprType Arg2XprType; + typedef typename Arg3TensorBlock::XprType Arg3XprType; + + static const bool NoArgBlockAccess = internal::is_void::value || + internal::is_void::value || + internal::is_void::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + typename BlockFactory::template XprType::type>::type XprType; + + typedef typename XprScalar::type Scalar; + + TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, + const Arg2TensorBlock& arg2_block, + const Arg3TensorBlock& arg3_block, + const BlockFactory& factory) + : m_arg1_block(arg1_block), + m_arg2_block(arg2_block), + m_arg3_block(arg3_block), + m_factory(factory) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + XprType expr() const { + return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), + m_arg3_block.expr()); + } + const Scalar* data() const { return NULL; } + void cleanup() { + m_arg1_block.cleanup(); + m_arg2_block.cleanup(); + m_arg3_block.cleanup(); + } + + private: + Arg1TensorBlock m_arg1_block; + Arg2TensorBlock m_arg2_block; + Arg3TensorBlock m_arg3_block; + BlockFactory m_factory; +}; + +// -------------------------------------------------------------------------- // +// StridedLinearBufferCopy provides a method to copy data between two linear +// buffers with different strides, with optimized paths for scatter/gather. + +template +class StridedLinearBufferCopy { + typedef typename packet_traits::type Packet; + enum { + Vectorizable = packet_traits::Vectorizable, + PacketSize = packet_traits::size + }; + + public: + // Specifying linear copy kind statically gives ~30% speedup for small sizes. 
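+  // (E.g. copying a column out of a row-major matrix is a Gather: src_stride
+  // is the row length and dst_stride == 1; broadcasting one scalar into a
+  // contiguous buffer is a FillLinear: src_stride == 0, dst_stride == 1.)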
+ enum Kind { + Linear = 0, // src_stride == 1 && dst_stride == 1 + Scatter = 1, // src_stride == 1 && dst_stride != 1 + FillLinear = 2, // src_stride == 0 && dst_stride == 1 + FillScatter = 3, // src_stride == 0 && dst_stride != 1 + Gather = 4, // dst_stride == 1 + Random = 5 // everything else + }; + + struct Dst { + Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {} + + IndexType offset; + IndexType stride; + Scalar* data; + }; + + struct Src { + Src(IndexType o, IndexType s, const Scalar* d) + : offset(o), stride(s), data(d) {} + + IndexType offset; + IndexType stride; + const Scalar* data; + }; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst, + const Src& src, + const size_t count) { + Run(count, dst.offset, dst.stride, dst.data, src.offset, src.stride, + src.data); + } + + private: + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const IndexType count, const IndexType dst_offset, + const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data, + const IndexType src_offset, const IndexType src_stride, + const Scalar* EIGEN_RESTRICT src_data) { + const Scalar* src = &src_data[src_offset]; + Scalar* dst = &dst_data[dst_offset]; + + if (!Vectorizable) { + for (Index i = 0; i < count; ++i) { + dst[i * dst_stride] = src[i * src_stride]; + } + return; + } + + const IndexType vectorized_size = count - PacketSize; + IndexType i = 0; + + if (kind == Linear) { + // ******************************************************************** // + // Linear copy from `src` to `dst`. + const IndexType unrolled_size = count - 4 * PacketSize; + eigen_assert(src_stride == 1 && dst_stride == 1); + for (; i <= unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + Packet p = ploadu(src + i + j * PacketSize); + pstoreu(dst + i + j * PacketSize, p); + } + } + for (; i <= vectorized_size; i += PacketSize) { + Packet p = ploadu(src + i); + pstoreu(dst + i, p); + } + for (; i < count; ++i) { + dst[i] = src[i]; + } + // ******************************************************************** // + } else if (kind == Scatter) { + // Scatter from `src` to `dst`. + eigen_assert(src_stride == 1 && dst_stride != 1); + for (; i <= vectorized_size; i += PacketSize) { + Packet p = ploadu(src + i); + pscatter(dst + i * dst_stride, p, dst_stride); + } + for (; i < count; ++i) { + dst[i * dst_stride] = src[i]; + } + // ******************************************************************** // + } else if (kind == FillLinear) { + // Fill `dst` with value at `*src`. + eigen_assert(src_stride == 0 && dst_stride == 1); + const IndexType unrolled_size = count - 4 * PacketSize; + Packet p = pload1(src); + for (; i <= unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + pstoreu(dst + i + j * PacketSize, p); + } + } + for (; i <= vectorized_size; i += PacketSize) { + pstoreu(dst + i, p); + } + for (; i < count; ++i) { + dst[i] = *src; + } + // ******************************************************************** // + } else if (kind == FillScatter) { + // Scatter `*src` into `dst`. + eigen_assert(src_stride == 0 && dst_stride != 1); + Packet p = pload1(src); + for (; i <= vectorized_size; i += PacketSize) { + pscatter(dst + i * dst_stride, p, dst_stride); + } + for (; i < count; ++i) { + dst[i * dst_stride] = *src; + } + // ******************************************************************** // + } else if (kind == Gather) { + // Gather from `src` into `dst`. 
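+      // (For instance, with src_stride == 3 and count == 8 this reads
+      // src[0], src[3], ..., src[21] into dst[0..7], gathering whole packets
+      // with pgather and storing them with pstoreu.)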
+ eigen_assert(dst_stride == 1); + for (; i <= vectorized_size; i += PacketSize) { + Packet p = pgather(src + i * src_stride, src_stride); + pstoreu(dst + i, p); + } + for (; i < count; ++i) { + dst[i] = src[i * src_stride]; + } + // ******************************************************************** // + } else if (kind == Random) { + // Random. + for (; i < count; ++i) { + dst[i * dst_stride] = src[i * src_stride]; + } + } else { + eigen_assert(false); + } + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block. +// It's possible to specify src->dst dimension mapping for the copy operation. +// Dimensions of `dst` specify how many elements have to be copied, for the +// `src` we need to know only stride to navigate through source memory buffer. + +template +class TensorBlockIO { + static const bool IsColMajor = (Layout == ColMajor); + + typedef StridedLinearBufferCopy LinCopy; + + public: + typedef DSizes Dimensions; + typedef DSizes DimensionsMap; + + struct Dst { + Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, + IndexType dst_offset = 0) + : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {} + + Dimensions dims; + Dimensions strides; + Scalar* data; + IndexType offset; + }; + + struct Src { + Src(const Dimensions& src_strides, const Scalar* src, + IndexType src_offset = 0) + : strides(src_strides), data(src), offset(src_offset) {} + + Dimensions strides; + const Scalar* data; + IndexType offset; + }; + + // Copies data to `dst` from `src`, using provided dimensions mapping: + // + // src_dimension_index = dst_to_src_dim_map[dst_dimension_index] + // + // Returns the number of copied elements. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy( + const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) { + // Copy single scalar value from `src` to `dst`. + if (NumDims == 0) { + *(dst.data + dst.offset) = *(src.data + src.offset); + return 1; + } + + // Both `dst` and `src` must have contiguous innermost dimension. We also + // accept the special case with stride '0', because it's used as a trick to + // implement broadcasting. + { + int inner_dim = IsColMajor ? 0 : NumDims - 1; + EIGEN_UNUSED_VARIABLE(inner_dim); + eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0); + eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0); + } + + // Give a shorter name to `dst_to_src_dim_map`. + const DimensionsMap& dim_map = dst_to_src_dim_map; + + // Do not squeeze reordered inner dimensions. + int num_squeezable_dims = NumSqueezableInnerDims(dim_map); + + // NOTE: We find the innermost dimension (contiguous in memory) in the dst + // block, and we write data linearly into that dimension, reading it from + // the src. If dimensions are reordered, we might end up reading data from + // the src with `stride != 1`. + // + // NOTE: Random-Read/Linear-Write can be up to ~2X faster than + // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680 + + // Find the innermost dimension in the dst whose size is not 1. This is the + // effective inner dim. + int num_size_one_inner_dims = 0; + for (int i = 0; i < num_squeezable_dims; ++i) { + const int dst_dim = IsColMajor ? i : NumDims - i - 1; + if (dst.dims[dst_dim] != 1) break; + num_size_one_inner_dims++; + } + + // If all dimensions are of size 1, just copy a scalar from `src` to `dst`. 
+ if (num_size_one_inner_dims == NumDims) { + *(dst.data + dst.offset) = *(src.data + src.offset); + return 1; + } + + // Outermost dimension in the dst with `stride == 1` (contiguous in memory). + const int dst_stride1_dim = IsColMajor + ? num_size_one_inner_dims + : NumDims - num_size_one_inner_dims - 1; + + // Dimension in the src that corresponds to the dst innermost dimension. + const int src_dim_for_dst_stride1_dim = + NumDims == 0 ? 1 : dim_map[dst_stride1_dim]; + + // Size of the innermost dimension (length of contiguous blocks of memory). + IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim]; + + // Squeeze multiple inner dims into one if they are contiguous in `dst` and + // `src` memory, so we can do less linear copy calls. + for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) { + const int dst_dim = IsColMajor ? i : NumDims - i - 1; + const IndexType dst_stride = dst.strides[dst_dim]; + const IndexType src_stride = src.strides[dim_map[dst_dim]]; + if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) { + dst_inner_dim_size *= dst.dims[dst_dim]; + ++num_size_one_inner_dims; + } else { + break; + } + } + + // Setup strides to read data from `src` and write to `dst`. + IndexType input_offset = src.offset; + IndexType output_offset = dst.offset; + IndexType input_stride = + NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim]; + IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim]; + + const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; + array it; + + // Initialize block iterator state. Squeeze away any dimension of size 1. + int idx = 0; // currently initialized iterator state index + for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { + const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2; + if (dst.dims[dst_dim] == 1) continue; + + it[idx].size = dst.dims[dst_dim]; + it[idx].input_stride = src.strides[dim_map[dst_dim]]; + it[idx].output_stride = dst.strides[dst_dim]; + + it[idx].input_span = it[idx].input_stride * (it[idx].size - 1); + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + + idx++; + } + + // Iterate copying data from src to dst. + const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize(); + +#define COPY_INNER_DIM(KIND) \ + IndexType num_copied = 0; \ + for (num_copied = 0; num_copied < block_total_size; \ + num_copied += dst_inner_dim_size) { \ + LinCopy::template Run( \ + typename LinCopy::Dst(output_offset, output_stride, dst.data), \ + typename LinCopy::Src(input_offset, input_stride, src.data), \ + dst_inner_dim_size); \ + \ + for (int j = 0; j < idx; ++j) { \ + if (++it[j].count < it[j].size) { \ + input_offset += it[j].input_stride; \ + output_offset += it[j].output_stride; \ + break; \ + } \ + it[j].count = 0; \ + input_offset -= it[j].input_span; \ + output_offset -= it[j].output_span; \ + } \ + } \ + return num_copied; + + if (input_stride == 1 && output_stride == 1) { + COPY_INNER_DIM(LinCopy::Linear); + } else if (input_stride == 1 && output_stride != 1) { + COPY_INNER_DIM(LinCopy::Scatter); + } else if (input_stride == 0 && output_stride == 1) { + COPY_INNER_DIM(LinCopy::FillLinear); + } else if (input_stride == 0 && output_stride != 1) { + COPY_INNER_DIM(LinCopy::FillScatter); + } else if (output_stride == 1) { + COPY_INNER_DIM(LinCopy::Gather); + } else { + COPY_INNER_DIM(LinCopy::Random); + } + +#undef COPY_INNER_DIM + } + + // Copy from `src` to `dst` with an identity src->dst dimension map. 
Returns + // the number of copied elements. + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst, + const Src& src) { + DimensionsMap dst_to_src_map; + for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i; + return Copy(dst, src, dst_to_src_map); + } + + private: + struct BlockIteratorState { + BlockIteratorState() + : size(0), + count(0), + input_stride(0), + output_stride(0), + input_span(0), + output_span(0) {} + + IndexType size; + IndexType count; + IndexType input_stride; + IndexType output_stride; + IndexType input_span; + IndexType output_span; + }; + + // Compute how many inner dimensions it's allowed to squeeze when doing IO + // between two tensor blocks. It's safe to squeeze inner dimensions, only + // if they are not reordered. + static int NumSqueezableInnerDims(const DimensionsMap& dim_map) { + int num_squeezable_dims = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + if (dim_map[dim] != dim) break; + num_squeezable_dims++; + } + return num_squeezable_dims; + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to +// a Tensor block defined by `desc`, backed by a memory buffer at `target`. +// +// Currently there is no way to write from a Tensor expression to a block of +// memory, if dimensions are reordered. If you need to do that, you should +// materialize a Tensor block expression into a memory buffer, and then use +// TensorBlockIO to copy data between two memory buffers with a custom +// `target->src` dimension map (see definition above). +// +// Also currently the innermost dimension of `target` must have a stride '1' +// (contiguous in memory). This restriction could be lifted with a `pscatter`, +// but in practice it's never needed, and there is a similar TensorBlockIO +// workaround for that. +// +// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO +// where `src` is a tensor expression. Explore if it is possible to rewrite IO +// to use expressions instead of pointers, and after that TensorBlockAssignment +// will become an alias to IO. +template +class TensorBlockAssignment { + // We will use coeff/packet path to evaluate block expressions. 
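+  // A usage sketch (names hypothetical): given a lazy block expression `expr`
+  // and a destination buffer `dst_ptr`,
+  //
+  //   TensorBlockAssignment<Scalar, NumDims, BlockExpr>::Run(
+  //       TensorBlockAssignment<Scalar, NumDims, BlockExpr>::target(
+  //           dims, internal::strides<ColMajor>(dims), dst_ptr),
+  //       expr);
+  //
+  // evaluates `expr` coefficient- or packet-wise straight into `dst_ptr`.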
+  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
+      TensorBlockEvaluator;
+
+  typedef DSizes<IndexType, NumDims> Dimensions;
+
+  enum {
+    Vectorizable = packet_traits<Scalar>::Vectorizable,
+    PacketSize = packet_traits<Scalar>::size
+  };
+
+  template <bool Vectorizable, typename Evaluator>
+  struct InnerDimAssign {
+    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
+                                        const Evaluator& eval,
+                                        IndexType eval_offset) {
+      for (IndexType i = 0; i < count; ++i) {
+        target[i] = eval.coeff(eval_offset + i);
+      }
+    }
+  };
+
+  template <typename Evaluator>
+  struct InnerDimAssign<true, Evaluator> {
+    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
+                                        const Evaluator& eval,
+                                        IndexType eval_offset) {
+      typedef typename packet_traits<Scalar>::type Packet;
+
+      const IndexType unrolled_size = count - 4 * PacketSize;
+      const IndexType vectorized_size = count - PacketSize;
+      IndexType i = 0;
+
+      for (; i <= unrolled_size; i += 4 * PacketSize) {
+        for (int j = 0; j < 4; ++j) {
+          const IndexType idx = eval_offset + i + j * PacketSize;
+          Packet p = eval.template packet<Unaligned>(idx);
+          pstoreu(target + i + j * PacketSize, p);
+        }
+      }
+
+      for (; i <= vectorized_size; i += PacketSize) {
+        Packet p = eval.template packet<Unaligned>(eval_offset + i);
+        pstoreu(target + i, p);
+      }
+
+      for (; i < count; ++i) {
+        target[i] = eval.coeff(eval_offset + i);
+      }
+    }
+  };
+
+ public:
+  struct Target {
+    Target(const Dimensions& target_dims, const Dimensions& target_strides,
+           Scalar* target_data, IndexType target_offset = 0)
+        : dims(target_dims),
+          strides(target_strides),
+          data(target_data),
+          offset(target_offset) {}
+
+    Dimensions dims;
+    Dimensions strides;
+    Scalar* data;
+    IndexType offset;
+  };
+
+  static Target target(const Dimensions& target_dims,
+                       const Dimensions& target_strides, Scalar* target_data,
+                       IndexType target_offset = 0) {
+    return Target(target_dims, target_strides, target_data, target_offset);
+  }
+
+  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
+  static Target target(
+      const DSizes<TargetDimsIndexType, NumDims>& target_dims,
+      const DSizes<TargetStridesIndexType, NumDims>& target_strides,
+      Scalar* target_data, IndexType target_offset = 0) {
+    // DSizes constructor will do index type promotion if it's safe.
+    return Target(Dimensions(target_dims), Dimensions(target_strides),
+                  target_data, target_offset);
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const Target& target, const TensorBlockExpr& expr) {
+    // Prepare an evaluator for the block expression.
+    DefaultDevice default_device;
+    TensorBlockEvaluator eval(expr, default_device);
+
+    // Tensor block expression dimensions should match destination dimensions.
+    eigen_assert(dimensions_match(target.dims, eval.dimensions()));
+
+    static const int Layout = TensorBlockEvaluator::Layout;
+    static const bool is_col_major = Layout == ColMajor;
+
+    // Initialize the output inner dimension size based on the layout.
+    const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
+    const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
+    IndexType output_inner_dim_size = target.dims[inner_dim_idx];
+
+    // Target inner dimension stride must be '1'.
+    eigen_assert(target.strides[inner_dim_idx] == 1);
+
+    // Squeeze multiple inner dims into one if they are contiguous in `target`.
+    IndexType num_squeezed_dims = 0;
+    for (Index i = 1; i < NumDims; ++i) {
+      const Index dim = is_col_major ? i : NumDims - i - 1;
+      const IndexType target_stride = target.strides[dim];
+
+      if (output_inner_dim_size == target_stride) {
+        output_inner_dim_size *= target.dims[dim];
+        num_squeezed_dims++;
+      } else {
+        break;
+      }
+    }
+
+    // Initialize the output block iterator state. Dimensions in this array
+    // are always in inner_most -> outer_most order (col major layout).
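+    // (E.g. for a col-major target with dims [4, 3, 2] and padded strides
+    // [1, 8, 24] nothing is squeezed, so it[0] walks the size-3 dimension and
+    // it[1] the size-2 dimension, while the inner loop writes runs of 4.)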
+ array it; + + int idx = 0; // currently initialized iterator state index + for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) { + const Index dim = is_col_major ? i + 1 : NumDims - i - 2; + + it[idx].count = 0; + it[idx].size = target.dims[dim]; + it[idx].output_stride = target.strides[dim]; + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + idx++; + } + + // We read block expression from the beginning, and start writing data to + // `target` at given offset. + IndexType input_offset = 0; + IndexType output_offset = target.offset; + + // Iterate copying data from `eval` to `target`. + for (IndexType i = 0; i < output_size; i += output_inner_dim_size) { + // Assign to `target` at current offset. + InnerDimAssign::Run(target.data + output_offset, + output_inner_dim_size, eval, + input_offset); + + // Move input offset forward by the number of assigned coefficients. + input_offset += output_inner_dim_size; + + // Update index. + for (int j = 0; j < idx; ++j) { + if (++it[j].count < it[j].size) { + output_offset += it[j].output_stride; + break; + } + it[j].count = 0; + output_offset -= it[j].output_span; + } + } + } + + private: + struct BlockIteratorState { + BlockIteratorState() + : count(0), size(0), output_stride(0), output_span(0) {} + + IndexType count; + IndexType size; + IndexType output_stride; + IndexType output_span; + }; +}; + +// -------------------------------------------------------------------------- // + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h deleted file mode 100644 index 029180ca5..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h +++ /dev/null @@ -1,1481 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H -#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H - -namespace Eigen { -namespace internal { - -// -------------------------------------------------------------------------- // -// Forward declarations for templates defined below. -template -class TensorBlockIOV2; - -// -------------------------------------------------------------------------- // -// Helper function to compute strides for densely stored buffer of given -// dimensions. - -// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use -// this function instead everywhere. -template -EIGEN_ALWAYS_INLINE DSizes strides( - const DSizes& dimensions) { - DSizes strides; - if (NumDims == 0) return strides; - - // TODO(ezhulenev): Use templates to unroll this loop (similar to - // h_array_reduce in CXX11meta.h)? Benchmark it. 
-  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-    strides[0] = 1;
-    for (int i = 1; i < NumDims; ++i) {
-      strides[i] = strides[i - 1] * dimensions[i - 1];
-    }
-  } else {
-    strides[NumDims - 1] = 1;
-    for (int i = NumDims - 2; i >= 0; --i) {
-      strides[i] = strides[i + 1] * dimensions[i + 1];
-    }
-  }
-
-  return strides;
-}
-
-template <int Layout, typename IndexType, size_t NumDims>
-EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
-    const Eigen::array<IndexType, NumDims>& dimensions) {
-  return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
-}
-
-template <int Layout, std::ptrdiff_t... Indices>
-EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
-    const Sizes<Indices...>& sizes) {
-  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
-}
-
-// -------------------------------------------------------------------------- //
-
-// Tensor block shape type defines what are the shape preference for the blocks
-// extracted from the larger tensor.
-//
-// Example: blocks of 100 elements from the large 100x100 tensor:
-// - tensor: 100x100
-// - target_block_size: 100
-//
-// TensorBlockShapeType:
-//  - kUniformAllDims: 100 blocks of size 10x10
-//  - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column
-//    or row major layout)
-enum class TensorBlockV2ShapeType { kUniformAllDims, kSkewedInnerDims };
-
-struct TensorBlockV2ResourceRequirements {
-  TensorBlockV2ShapeType shape_type;
-  size_t size;
-
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE TensorBlockV2ResourceRequirements
-  merge(const TensorBlockV2ResourceRequirements& lhs,
-        const TensorBlockV2ResourceRequirements& rhs) {
-    return {merge(lhs.shape_type, rhs.shape_type), merge(rhs.size, lhs.size)};
-  }
-
-  // This is a resource requirement that should be returned from expressions
-  // that do not have any block evaluation preference (e.g. default tensor
-  // expression with raw buffer access).
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE TensorBlockV2ResourceRequirements any() {
-    return {TensorBlockV2ShapeType::kUniformAllDims, 1};
-  }
-
- private:
-  using Requirements = TensorBlockV2ResourceRequirements;
-
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
-    return numext::maxi(lhs_size, rhs_size);
-  }
-
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE TensorBlockV2ShapeType merge(
-      TensorBlockV2ShapeType lhs, TensorBlockV2ShapeType rhs) {
-    return (lhs == TensorBlockV2ShapeType::kSkewedInnerDims ||
-            rhs == TensorBlockV2ShapeType::kSkewedInnerDims)
-               ? TensorBlockV2ShapeType::kSkewedInnerDims
-               : TensorBlockV2ShapeType::kUniformAllDims;
-  }
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockDescriptor specifies a block offset within a tensor and the block
-// sizes along each of the tensor dimensions.
-
-template <int NumDims, typename IndexType = Eigen::Index>
-class TensorBlockDescriptor {
- public:
-  typedef DSizes<IndexType, NumDims> Dimensions;
-
-  // If we evaluate a Tensor assignment, and the expression on the left already
-  // has a memory buffer, then we might do a performance optimization, and
-  // evaluate the root expression directly into the final output memory.
-  // Sometimes it's possible to reuse it for materializing subexpressions
-  // inside an expression tree, to avoid dynamic memory allocation.
-  //
-  // The pointer type of the underlying storage is erased, because passing
-  // Scalar type through all the expression evaluation layers is way too many
-  // templates. In practice destination buffer type should always match the
-  // evaluated expression scalar type.
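// For illustration (hypothetical): a float expression registers its output
// buffer via AddDestinationBuffer<Layout>(float_ptr, strides); the pointer is
// stored as void*, and destination().template data<float>() recovers it, with
// m_data_type_size guarding against a scalar type mismatch.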
-  class DestinationBuffer {
-   public:
-    enum DestinationBufferKind : int {
-      // The above explicit specification of "int" as the enum basetype is
-      // needed to get around a HIPCC link error ("the field type is not
-      // amp-compatible") which is issued for class members with the enum type.
-      // TODO(rocm):
-      // remove the "int" basetype once HIPCC has been fixed to not error out
-      // in the above scenario.
-
-      // Destination buffer is not defined (`m_data` == nullptr).
-      kEmpty,
-
-      // Tensor block defined by an owning tensor block descriptor can fit
-      // contiguously into the destination buffer. In this case it's safe to
-      // materialize tensor block in the destination buffer, wrap it in a
-      // TensorMap, and use it to build an Eigen expression on top of it.
-      kContiguous,
-
-      // Destination buffer strides do not match strides of the contiguously
-      // stored block, and it's impossible to define a TensorMap over this
-      // buffer. However if we are evaluating a root of an expression tree, we
-      // still can materialize an output into this destination, because we can
-      // guarantee that no one will ever access it through block API.
-      //
-      // In theory it is possible to build a valid TensorStriding expression
-      // on top of this destination buffer, however it has inefficient
-      // coeff/packet access, and defeats the purpose of fast block
-      // evaluation API.
-      kStrided
-    };
-
-    template <typename Scalar>
-    Scalar* data() const {
-      eigen_assert(m_data_type_size == sizeof(Scalar));
-      return static_cast<Scalar*>(m_data);
-    }
-
-    const Dimensions& strides() const { return m_strides; }
-    const DestinationBufferKind& kind() const { return m_kind; }
-
-   private:
-    friend class TensorBlockDescriptor;
-
-    DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
-
-    template <typename Scalar>
-    DestinationBuffer(Scalar* data, const Dimensions& strides,
-                      DestinationBufferKind kind)
-        : m_data(static_cast<void*>(data)),
-          m_data_type_size(sizeof(Scalar)),
-          m_strides(strides),
-          m_kind(kind) {}
-
-    template <int Layout, typename Scalar>
-    static DestinationBuffer make(const TensorBlockDescriptor& desc,
-                                  Scalar* data, const Dimensions& strides) {
-      return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
-    }
-
-    template <int Layout>
-    static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
-                                      const Dimensions& strides) {
-      const Dimensions& desc_dims = desc.dimensions();
-      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
-      for (int i = 0; i < NumDims; ++i) {
-        if (desc_dims[i] == 1) continue;
-        if (desc_strides[i] != strides[i]) return kStrided;
-      }
-      return kContiguous;
-    }
-
-    // Storage pointer is type erased, to reduce template bloat, but we still
-    // keep the size of the underlying element type for error checking.
-    void* m_data;
-    size_t m_data_type_size;
-
-    // Destination buffer dimensions always match the dimensions of a tensor
-    // block descriptor it belongs to, however strides might be different.
-    Dimensions m_strides;
-
-    DestinationBufferKind m_kind;
-  };
-
-  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
-                        const DestinationBuffer& destination)
-      : m_offset(offset),
-        m_dimensions(dimensions),
-        m_destination(destination) {}
-
-  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
-      : m_offset(offset),
-        m_dimensions(dimensions),
-        m_destination(DestinationBuffer()) {}
-
-  IndexType offset() const { return m_offset; }
-  const Dimensions& dimensions() const { return m_dimensions; }
-  IndexType dimension(int index) const { return m_dimensions[index]; }
-  IndexType size() const { return array_prod(m_dimensions); }
-
-  const DestinationBuffer& destination() const { return m_destination; }
-
-  template <int Layout, typename Scalar>
-  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
-    eigen_assert(dst_base != NULL);
-    m_destination =
-        DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
-  }
-
-  template <int Layout, typename Scalar, typename DstStridesIndexType>
-  void AddDestinationBuffer(
-      Scalar* dst_base,
-      const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
-    // DSizes constructor will do index type promotion if it's safe.
-    AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
-  }
-
-  TensorBlockDescriptor& DropDestinationBuffer() {
-    m_destination.m_data = NULL;
-    m_destination.m_kind = DestinationBuffer::kEmpty;
-    return *this;
-  }
-
-  bool HasDestinationBuffer() const {
-    return m_destination.kind() != DestinationBuffer::kEmpty;
-  }
-
-  // Returns a copy of `*this` with updated offset.
-  TensorBlockDescriptor WithOffset(IndexType offset) const {
-    return TensorBlockDescriptor(offset, m_dimensions, m_destination);
-  }
-
- private:
-  // Offset and dimensions are immutable after construction. Block descriptor
-  // can only be mutated by adding or dropping destination.
-  const IndexType m_offset;
-  const Dimensions m_dimensions;
-  DestinationBuffer m_destination;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockMapper is responsible for iterating over the blocks of a tensor.
-
-template <int NumDims, int Layout, typename IndexType = Eigen::Index>
-class TensorBlockV2Mapper {
-  typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
-
- public:
-  typedef DSizes<IndexType, NumDims> Dimensions;
-
-  TensorBlockV2Mapper() = default;
-  TensorBlockV2Mapper(const DSizes<IndexType, NumDims>& dimensions,
-                      const TensorBlockV2ResourceRequirements& requirements)
-      : m_tensor_dimensions(dimensions), m_requirements(requirements) {
-    // Initialize `m_block_dimensions`.
-    InitializeBlockDimensions();
-
-    // Calculate block counts by dimension and total block count.
-    DSizes<IndexType, NumDims> block_count;
-    for (int i = 0; i < NumDims; ++i) {
-      block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
-    }
-    m_total_block_count = array_prod(block_count);
-
-    // Calculate block strides (used for enumerating blocks).
-    m_tensor_strides = strides<Layout>(m_tensor_dimensions);
-    m_block_strides = strides<Layout>(block_count);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
-    return m_total_block_count;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
-    return m_block_dimensions.TotalSize();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
-  blockDimensions() const {
-    return m_block_dimensions;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  BlockDescriptor blockDescriptor(IndexType block_index) const {
-    static const bool isColMajor = Layout == static_cast<int>(ColMajor);
-
-    IndexType offset = 0;
-    DSizes<IndexType, NumDims> dimensions;
-
-    if (NumDims == 0) return BlockDescriptor(offset, dimensions);
-
-    // Iterate outer -> inner dimensions.
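// For illustration (hypothetical, ColMajor): a 10x10 tensor split into 5x5
// blocks has block_count = [2, 2] and m_block_strides = [1, 2]; block_index 3
// decomposes into block coordinates [1, 1], giving dimensions [5, 5] and
// offset = 5 * 1 + 5 * 10 = 55.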
-    for (int i = NumDims - 1; i >= 0; --i) {
-      const int dim = isColMajor ? i : NumDims - i - 1;
-
-      const IndexType idx = block_index / m_block_strides[dim];
-      block_index -= idx * m_block_strides[dim];
-
-      const IndexType coord = idx * m_block_dimensions[dim];
-      dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
-                                     m_block_dimensions[dim]);
-      offset += coord * m_tensor_strides[dim];
-    }
-
-    return {offset, dimensions};
-  }
-
- private:
-  void InitializeBlockDimensions() {
-    // Requested block shape and size.
-    const TensorBlockV2ShapeType shape_type = m_requirements.shape_type;
-    const IndexType target_block_size =
-        numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
-
-    // Corner case: one of the dimensions is zero. Logic below is too complex
-    // to handle this case on a general basis, just use unit block size.
-    // Note: we must not yield blocks with zero dimensions (recipe for
-    // overflows/underflows, divisions by zero and NaNs later).
-    if (m_tensor_dimensions.TotalSize() == 0) {
-      for (int i = 0; i < NumDims; ++i) {
-        m_block_dimensions[i] = 1;
-      }
-      return;
-    }
-
-    // If tensor fits into a target block size, evaluate it as a single block.
-    if (m_tensor_dimensions.TotalSize() <= target_block_size) {
-      m_block_dimensions = m_tensor_dimensions;
-      return;
-    }
-
-    static const bool isColMajor = Layout == static_cast<int>(ColMajor);
-
-    // Block shape skewed towards inner dimension.
-    if (shape_type == TensorBlockV2ShapeType::kSkewedInnerDims) {
-      IndexType coeff_to_allocate = target_block_size;
-
-      for (int i = 0; i < NumDims; ++i) {
-        const int dim = isColMajor ? i : NumDims - i - 1;
-        m_block_dimensions[dim] =
-            numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
-        coeff_to_allocate = divup(
-            coeff_to_allocate,
-            numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
-      }
-      eigen_assert(coeff_to_allocate == 1);
-
-    } else if (shape_type == TensorBlockV2ShapeType::kUniformAllDims) {
-      // Tensor will not fit within 'target_block_size' budget: calculate tensor
-      // block dimension sizes based on "square" dimension size target.
-      const IndexType dim_size_target = convert_index<IndexType>(
-          std::pow(static_cast<float>(target_block_size),
-                   1.0f / static_cast<float>(m_block_dimensions.rank())));
-
-      for (int i = 0; i < NumDims; ++i) {
-        // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
-        // a multiple of the packet size. Note that reducing
-        // 'block_dim_size' in this manner can increase the number of
-        // blocks, and so will amplify any per-block overhead.
-        m_block_dimensions[i] =
-            numext::mini(dim_size_target, m_tensor_dimensions[i]);
-      }
-
-      // Add any un-allocated coefficients to inner dimension(s).
-      IndexType total_size = m_block_dimensions.TotalSize();
-      for (int i = 0; i < NumDims; ++i) {
-        const int dim = isColMajor ? i : NumDims - i - 1;
-
-        if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
-          const IndexType total_size_other_dims =
-              total_size / m_block_dimensions[dim];
-          const IndexType alloc_avail =
-              divup<IndexType>(target_block_size, total_size_other_dims);
-          if (alloc_avail == m_block_dimensions[dim]) {
-            // Insufficient excess coefficients to allocate.
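// (For illustration, hypothetical: with target_block_size = 100, a 100x2
// tensor starts from uniform dims [10, 2], and this loop grows the inner
// dimension to [50, 2] to use the remaining budget before breaking.)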
-            break;
-          }
-          m_block_dimensions[dim] =
-              numext::mini(m_tensor_dimensions[dim], alloc_avail);
-          total_size = total_size_other_dims * m_block_dimensions[dim];
-        }
-      }
-
-    } else {
-      eigen_assert(false);  // unknown block shape
-    }
-
-    eigen_assert(m_block_dimensions.TotalSize() >=
-                 numext::mini<IndexType>(target_block_size,
-                                         m_tensor_dimensions.TotalSize()));
-  }
-
-  DSizes<IndexType, NumDims> m_tensor_dimensions;
-  TensorBlockV2ResourceRequirements m_requirements;
-
-  DSizes<IndexType, NumDims> m_block_dimensions;
-  IndexType m_total_block_count;
-
-  DSizes<IndexType, NumDims> m_tensor_strides;
-  DSizes<IndexType, NumDims> m_block_strides;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockScratchAllocator is responsible for allocating temporary buffers
-// for block evaluation (output or input block materialization). Given that
-// Eigen expression traversal order is deterministic, all temporary allocations
-// are happening in the same order, and usually have exactly the same size.
-// Scratch allocator keeps a trace of all dynamic allocations, and after the
-// first block evaluation is completed, we should be able to reuse all the
-// temporary buffers for the next block evaluation.
-
-template <typename Device>
-class TensorBlockScratchAllocator {
- public:
-  explicit TensorBlockScratchAllocator(const Device& device)
-      : m_device(device), m_allocation_index(0) {}
-
-  ~TensorBlockScratchAllocator() {
-    for (size_t i = 0; i < m_allocations.size(); ++i) {
-      m_device.deallocate(m_allocations[i].ptr);
-    }
-  }
-
-  void* allocate(size_t size) {
-    // TODO(ezhulenev): Remove when replaced with inlined vector.
-    if (m_allocations.capacity() == 0) m_allocations.reserve(8);
-
-    // Check if we already have an existing allocation at current index.
-    const int num_allocations = static_cast<int>(m_allocations.size());
-    const bool has_allocation = m_allocation_index < num_allocations;
-
-    // Allocation index can't be larger than the number of allocations.
-    eigen_assert(m_allocation_index <= num_allocations);
-
-    // If we have an existing allocation, and its size is larger or equal to
-    // the requested size, we do nothing.
-
-    // If the current allocation can't fit the requested size, we deallocate
-    // it, and replace it with a larger allocation.
-    if (has_allocation && m_allocations[m_allocation_index].size < size) {
-      m_device.deallocate(m_allocations[m_allocation_index].ptr);
-      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
-      m_allocations[m_allocation_index].size = size;
-    }
-
-    // Make a new allocation if we don't have an existing one.
-    if (!has_allocation) {
-      Allocation allocation;
-      allocation.ptr = m_device.allocate(size);
-      allocation.size = size;
-      m_allocations.push_back(allocation);
-    }
-
-    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
-    eigen_assert(m_allocations[m_allocation_index].size >= size);
-
-    return m_allocations[m_allocation_index++].ptr;
-  }
-
-  void reset() { m_allocation_index = 0; }
-
- private:
-  struct Allocation {
-    void* ptr;
-    size_t size;
-  };
-
-  const Device& m_device;
-  int m_allocation_index;
-  // TODO(ezhulenev): This should be an inlined vector.
-  std::vector<Allocation> m_allocations;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockKind represents all possible block kinds, that can be produced by
-// the TensorEvaluator::evalBlock function.
-enum TensorBlockKind {
-  // Tensor block that is a lazy expression that must be assigned to a
-  // destination using TensorBlockAssign.
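// (For illustration: a block of the hypothetical expression `a + b` stays
// lazy as kExpr; the sum is evaluated only when the block is assigned.)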
-  kExpr,
-
-  // Tensor block that is a view into a memory buffer owned by an underlying
-  // Tensor expression (e.g. it can be a view into a Tensor buffer).
-  kView,
-
-  // Tensor block that was materialized in a scratch memory buffer, allocated
-  // with TensorBlockScratchAllocator. This block must be copied to a
-  // destination, similar to a block of `kExpr` type.
-  kMaterializedInScratch,
-
-  // Tensor block that was materialized directly into the final output memory
-  // buffer. For example if the left side of an assignment is a Tensor, we can
-  // directly materialize the block in the destination memory.
-  //
-  // If strides in the output buffer do not match tensor block strides, the
-  // Tensor expression will be invalid, and should not be used by
-  // TensorBlockAssign or for constructing another block expression.
-  kMaterializedInOutput
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockNotImplemented should be used to define the TensorBlock typedef
-// in TensorEvaluators that do not support block evaluation.
-
-class TensorBlockNotImplemented {
- public:
-  typedef void XprType;
-};
-
-// -------------------------------------------------------------------------- //
-// XprScalar extracts the Scalar type from the Eigen expressions (if the
-// expression type is not void). It's required to be able to define a lazy
-// block expression for argument types that do not support block evaluation.
-
-template <typename XprType>
-struct XprScalar {
-  typedef typename XprType::Scalar type;
-};
-template <>
-struct XprScalar<void> {
-  typedef void type;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorMaterializedBlock is a fully evaluated block of the original tensor,
-// and XprType is just a TensorMap over the data. This block type is typically
-// used to materialize blocks of tensor expressions that can't be efficiently
-// represented as lazy Tensor expressions with fast coeff/packet operations,
-// e.g. we materialize all broadcasts into evaluated blocks.
-//
-// TensorMaterializedBlock does not own its memory buffer, it's either a memory
-// buffer that backs the original expression (e.g. block is just a view into a
-// Tensor), or a memory buffer allocated with scratch allocator, and in this
-// case the scratch allocator will deallocate it at the end of block based
-// expression execution.
-//
-// If the block was evaluated directly into the output buffer, and strides in
-// the output buffer do not match block strides, the TensorMap expression will
-// be invalid, and should never be used in block assignment or any other tensor
-// expression.
-
-template <typename Scalar, int NumDims, int Layout,
-          typename IndexType = Eigen::Index>
-class TensorMaterializedBlock {
- public:
-  typedef DSizes<IndexType, NumDims> Dimensions;
-  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
-
-  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
-                          const Dimensions& dimensions, bool valid_expr = true)
-      : m_kind(kind),
-        m_data(data),
-        m_dimensions(dimensions),
-        m_expr(m_data, m_dimensions),
-        m_valid_expr(valid_expr) {
-    eigen_assert(m_kind == internal::TensorBlockKind::kView ||
-                 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
-                 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
-  }
-
-  TensorBlockKind kind() const { return m_kind; }
-  // NOTE(ezhulenev): Returning XprType by value like in other block types
-  // causes asan failures. The theory is that XprType::Nested doesn't work
-  // properly for TensorMap.
-  const XprType& expr() const {
-    eigen_assert(m_valid_expr);
-    return m_expr;
-  }
-  const Scalar* data() const { return m_data; }
-  void cleanup() {}
-
-  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
-
-  // TensorMaterializedBlock can be backed by different types of storage:
-  //
-  //   (1) Contiguous block of memory allocated with scratch allocator.
-  //   (2) Contiguous block of memory reused from tensor block descriptor
-  //       destination buffer.
-  //   (3) Strided block of memory reused from tensor block descriptor
-  //       destination buffer.
-  //
-  class Storage {
-   public:
-    Scalar* data() const { return m_data; }
-    const Dimensions& dimensions() const { return m_dimensions; }
-    const Dimensions& strides() const { return m_strides; }
-
-    TensorMaterializedBlock AsTensorMaterializedBlock() const {
-      return TensorMaterializedBlock(
-          m_materialized_in_output
-              ? internal::TensorBlockKind::kMaterializedInOutput
-              : internal::TensorBlockKind::kMaterializedInScratch,
-          m_data, m_dimensions, !m_strided_storage);
-    }
-
-   private:
-    friend class TensorMaterializedBlock;
-
-    Storage(Scalar* data, const Dimensions& dimensions,
-            const Dimensions& strides, bool materialized_in_output,
-            bool strided_storage)
-        : m_data(data),
-          m_dimensions(dimensions),
-          m_strides(strides),
-          m_materialized_in_output(materialized_in_output),
-          m_strided_storage(strided_storage) {}
-
-    Scalar* m_data;
-    Dimensions m_dimensions;
-    Dimensions m_strides;
-    bool m_materialized_in_output;
-    bool m_strided_storage;
-  };
-
-  // Creates a storage for the materialized block either from the block
-  // descriptor destination buffer, or allocates a new buffer with the scratch
-  // allocator.
-  template <typename TensorBlockScratch>
-  EIGEN_STRONG_INLINE static Storage prepareStorage(
-      TensorBlockDesc& desc, TensorBlockScratch& scratch,
-      bool allow_strided_storage = false) {
-    // Try to reuse destination as an output block buffer.
-    typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
-
-    if (desc.destination().kind() == DestinationBuffer::kContiguous) {
-      Scalar* buffer = desc.destination().template data<Scalar>();
-      desc.DropDestinationBuffer();
-      return Storage(buffer, desc.dimensions(),
-                     internal::strides<Layout>(desc.dimensions()),
-                     /*materialized_in_output=*/true,
-                     /*strided_storage=*/false);
-
-    } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
-               allow_strided_storage) {
-      Scalar* buffer = desc.destination().template data<Scalar>();
-      desc.DropDestinationBuffer();
-      return Storage(buffer, desc.dimensions(), desc.destination().strides(),
-                     /*materialized_in_output=*/true, /*strided_storage=*/true);
-
-    } else {
-      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
-      return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
-                     internal::strides<Layout>(desc.dimensions()),
-                     /*materialized_in_output=*/false,
-                     /*strided_storage=*/false);
-    }
-  }
-
-  // Creates a materialized block for the given descriptor from a memory buffer.
-  template <typename DataDimensions, typename TensorBlockScratch>
-  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
-      const Scalar* data, const DataDimensions& data_dims,
-      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
-    eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
-
-    // If the tensor block dimensions cover a contiguous block of the
-    // underlying memory, we can skip the block buffer memory allocation, and
-    // construct a block from the existing `data` memory buffer.
-    //
-    // Example: (RowMajor layout)
-    //   data_dims:          [11, 12, 13, 14]
-    //   desc.dimensions():  [1,   1,  3, 14]
-    //
-    // In this case we can construct a TensorBlock starting at
-    // `data + desc.offset()`, with a `desc.dimensions()` block sizes.
-    static const bool is_col_major = Layout == ColMajor;
-
-    // Find out how many inner dimensions have a matching size.
-    int num_matching_inner_dims = 0;
-    for (int i = 0; i < NumDims; ++i) {
-      int dim = is_col_major ? i : NumDims - i - 1;
-      if (data_dims[dim] != desc.dimensions()[dim]) break;
-      ++num_matching_inner_dims;
-    }
-
-    // All the outer dimensions must be of size `1`, except a single dimension
-    // before the matching inner dimension (`3` in the example above).
-    bool can_use_direct_access = true;
-    for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
-      int dim = is_col_major ? i : NumDims - i - 1;
-      if (desc.dimension(dim) != 1) {
-        can_use_direct_access = false;
-        break;
-      }
-    }
-
-    if (can_use_direct_access) {
-      const Scalar* block_start = data + desc.offset();
-      return TensorMaterializedBlock(internal::TensorBlockKind::kView,
-                                     block_start, desc.dimensions());
-
-    } else {
-      // Reuse destination buffer or allocate new buffer with scratch allocator.
-      const Storage storage = prepareStorage(desc, scratch);
-
-      typedef internal::TensorBlockIOV2<Scalar, IndexType, NumDims, Layout>
-          TensorBlockIO;
-      typedef typename TensorBlockIO::Dst TensorBlockIODst;
-      typedef typename TensorBlockIO::Src TensorBlockIOSrc;
-
-      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
-                           data, desc.offset());
-      TensorBlockIODst dst(storage.dimensions(), storage.strides(),
-                           storage.data());
-
-      TensorBlockIO::Copy(dst, src);
-      return storage.AsTensorMaterializedBlock();
-    }
-  }
-
- private:
-  TensorBlockKind m_kind;
-  const Scalar* m_data;
-  Dimensions m_dimensions;
-  XprType m_expr;
-  bool m_valid_expr;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
-// functor to the blocks produced by the underlying Tensor expression.
-
-template <typename UnaryOp, typename ArgTensorBlock>
-class TensorCwiseUnaryBlock {
-  static const bool NoArgBlockAccess =
-      internal::is_void<typename ArgTensorBlock::XprType>::value;
-
- public:
-  typedef typename conditional<
-      NoArgBlockAccess, void,
-      TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
-      type XprType;
-
-  typedef typename XprScalar<XprType>::type Scalar;
-
-  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
-      : m_arg_block(arg_block), m_functor(functor) {}
-
-  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
-
-  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
-  const Scalar* data() const { return NULL; }
-  void cleanup() { m_arg_block.cleanup(); }
-
- private:
-  ArgTensorBlock m_arg_block;
-  UnaryOp m_functor;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorCwiseBinaryBlock is a lazy tensor expression block that applies
-// BinaryOp functor to the blocks produced by the underlying Tensor expression.
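// For illustration (hypothetical): for an expression like (a + b).abs(),
// block evaluation yields a TensorCwiseUnaryBlock wrapping a
// TensorCwiseBinaryBlock, and expr() rebuilds the same expression tree on top
// of the argument blocks' underlying expressions.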
-
-template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
-class TensorCwiseBinaryBlock {
-  static const bool NoArgBlockAccess =
-      internal::is_void<typename LhsTensorBlock::XprType>::value ||
-      internal::is_void<typename RhsTensorBlock::XprType>::value;
-
- public:
-  typedef typename conditional<
-      NoArgBlockAccess, void,
-      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
-                          const typename RhsTensorBlock::XprType> >::type
-      XprType;
-
-  typedef typename XprScalar<XprType>::type Scalar;
-
-  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
-                         const RhsTensorBlock& right_block,
-                         const BinaryOp& functor)
-      : m_left_block(left_block),
-        m_right_block(right_block),
-        m_functor(functor) {}
-
-  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
-
-  XprType expr() const {
-    return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
-  }
-
-  const Scalar* data() const { return NULL; }
-
-  void cleanup() {
-    m_left_block.cleanup();
-    m_right_block.cleanup();
-  }
-
- private:
-  LhsTensorBlock m_left_block;
-  RhsTensorBlock m_right_block;
-  BinaryOp m_functor;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorUnaryExprBlock is a lazy tensor expression block that can construct
-// an arbitrary tensor expression from a block of the underlying type (this is a
-// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
-
-template <typename BlockFactory, typename ArgTensorBlock>
-class TensorUnaryExprBlock {
-  typedef typename ArgTensorBlock::XprType ArgXprType;
-  static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
-
- public:
-  typedef typename conditional<
-      NoArgBlockAccess, void,
-      typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;
-
-  typedef typename XprScalar<XprType>::type Scalar;
-
-  TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
-                       const BlockFactory& factory)
-      : m_arg_block(arg_block), m_factory(factory) {}
-
-  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
-  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
-  const Scalar* data() const { return NULL; }
-  void cleanup() { m_arg_block.cleanup(); }
-
- private:
-  ArgTensorBlock m_arg_block;
-  BlockFactory m_factory;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorTernaryExprBlock is a lazy tensor expression block that can construct
-// an arbitrary tensor expression from three blocks of the underlying type.
-
-template <typename BlockFactory, typename Arg1TensorBlock,
-          typename Arg2TensorBlock, typename Arg3TensorBlock>
-class TensorTernaryExprBlock {
-  typedef typename Arg1TensorBlock::XprType Arg1XprType;
-  typedef typename Arg2TensorBlock::XprType Arg2XprType;
-  typedef typename Arg3TensorBlock::XprType Arg3XprType;
-
-  static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
-                                       internal::is_void<Arg2XprType>::value ||
-                                       internal::is_void<Arg3XprType>::value;
-
- public:
-  typedef typename conditional<
-      NoArgBlockAccess, void,
-      typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
-                                              Arg3XprType>::type>::type XprType;
-
-  typedef typename XprScalar<XprType>::type Scalar;
-
-  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
-                         const Arg2TensorBlock& arg2_block,
-                         const Arg3TensorBlock& arg3_block,
-                         const BlockFactory& factory)
-      : m_arg1_block(arg1_block),
-        m_arg2_block(arg2_block),
-        m_arg3_block(arg3_block),
-        m_factory(factory) {}
-
-  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
-  XprType expr() const {
-    return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
-                          m_arg3_block.expr());
-  }
-  const Scalar* data() const { return NULL; }
-  void cleanup() {
-    m_arg1_block.cleanup();
-    m_arg2_block.cleanup();
-    m_arg3_block.cleanup();
-  }
-
- private:
-  Arg1TensorBlock m_arg1_block;
-  Arg2TensorBlock m_arg2_block;
-  Arg3TensorBlock m_arg3_block;
-  BlockFactory m_factory;
-};
-
-// -------------------------------------------------------------------------- //
-// StridedLinearBufferCopy provides a method to copy data between two linear
-// buffers with different strides, with optimized paths for scatter/gather.
-
-template <typename Scalar, typename IndexType>
-class StridedLinearBufferCopy {
-  typedef typename packet_traits<Scalar>::type Packet;
-  enum {
-    Vectorizable = packet_traits<Scalar>::Vectorizable,
-    PacketSize = packet_traits<Scalar>::size
-  };
-
- public:
-  // Specifying linear copy kind statically gives ~30% speedup for small sizes.
-  enum Kind {
-    Linear = 0,       // src_stride == 1 && dst_stride == 1
-    Scatter = 1,      // src_stride == 1 && dst_stride != 1
-    FillLinear = 2,   // src_stride == 0 && dst_stride == 1
-    FillScatter = 3,  // src_stride == 0 && dst_stride != 1
-    Gather = 4,       // dst_stride == 1
-    Random = 5        // everything else
-  };
-
-  struct Dst {
-    Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
-
-    IndexType offset;
-    IndexType stride;
-    Scalar* data;
-  };
-
-  struct Src {
-    Src(IndexType o, IndexType s, const Scalar* d)
-        : offset(o), stride(s), data(d) {}
-
-    IndexType offset;
-    IndexType stride;
-    const Scalar* data;
-  };
-
-  template <typename StridedLinearBufferCopy::Kind kind>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
-                                                        const Src& src,
-                                                        const size_t count) {
-    Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
-              src.data);
-  }
-
- private:
-  template <typename StridedLinearBufferCopy::Kind kind>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const IndexType count, const IndexType dst_offset,
-      const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
-      const IndexType src_offset, const IndexType src_stride,
-      const Scalar* EIGEN_RESTRICT src_data) {
-    const Scalar* src = &src_data[src_offset];
-    Scalar* dst = &dst_data[dst_offset];
-
-    if (!Vectorizable) {
-      for (Index i = 0; i < count; ++i) {
-        dst[i * dst_stride] = src[i * src_stride];
-      }
-      return;
-    }
-
-    const IndexType vectorized_size = count - PacketSize;
-    IndexType i = 0;
-
-    if (kind == Linear) {
-      // ******************************************************************** //
-      // Linear copy from `src` to `dst`.
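// For illustration (hypothetical): count = 35 with PacketSize = 4 copies 32
// elements in the 4x-unrolled packet loop below, none in the single-packet
// loop, and the remaining 3 in the scalar tail.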
-      const IndexType unrolled_size = count - 4 * PacketSize;
-      eigen_assert(src_stride == 1 && dst_stride == 1);
-      for (; i <= unrolled_size; i += 4 * PacketSize) {
-        for (int j = 0; j < 4; ++j) {
-          Packet p = ploadu<Packet>(src + i + j * PacketSize);
-          pstoreu<Scalar>(dst + i + j * PacketSize, p);
-        }
-      }
-      for (; i <= vectorized_size; i += PacketSize) {
-        Packet p = ploadu<Packet>(src + i);
-        pstoreu<Scalar>(dst + i, p);
-      }
-      for (; i < count; ++i) {
-        dst[i] = src[i];
-      }
-      // ******************************************************************** //
-    } else if (kind == Scatter) {
-      // Scatter from `src` to `dst`.
-      eigen_assert(src_stride == 1 && dst_stride != 1);
-      for (; i <= vectorized_size; i += PacketSize) {
-        Packet p = ploadu<Packet>(src + i);
-        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
-      }
-      for (; i < count; ++i) {
-        dst[i * dst_stride] = src[i];
-      }
-      // ******************************************************************** //
-    } else if (kind == FillLinear) {
-      // Fill `dst` with value at `*src`.
-      eigen_assert(src_stride == 0 && dst_stride == 1);
-      const IndexType unrolled_size = count - 4 * PacketSize;
-      Packet p = pload1<Packet>(src);
-      for (; i <= unrolled_size; i += 4 * PacketSize) {
-        for (int j = 0; j < 4; ++j) {
-          pstoreu<Scalar>(dst + i + j * PacketSize, p);
-        }
-      }
-      for (; i <= vectorized_size; i += PacketSize) {
-        pstoreu<Scalar>(dst + i, p);
-      }
-      for (; i < count; ++i) {
-        dst[i] = *src;
-      }
-      // ******************************************************************** //
-    } else if (kind == FillScatter) {
-      // Scatter `*src` into `dst`.
-      eigen_assert(src_stride == 0 && dst_stride != 1);
-      Packet p = pload1<Packet>(src);
-      for (; i <= vectorized_size; i += PacketSize) {
-        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
-      }
-      for (; i < count; ++i) {
-        dst[i * dst_stride] = *src;
-      }
-      // ******************************************************************** //
-    } else if (kind == Gather) {
-      // Gather from `src` into `dst`.
-      eigen_assert(dst_stride == 1);
-      for (; i <= vectorized_size; i += PacketSize) {
-        Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
-        pstoreu<Scalar>(dst + i, p);
-      }
-      for (; i < count; ++i) {
-        dst[i] = src[i * src_stride];
-      }
-      // ******************************************************************** //
-    } else if (kind == Random) {
-      // Random.
-      for (; i < count; ++i) {
-        dst[i * dst_stride] = src[i * src_stride];
-      }
-    } else {
-      eigen_assert(false);
-    }
-  }
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockIO copies data from the `src` tensor block to the `dst` tensor
-// block. It's possible to specify a src->dst dimension mapping for the copy
-// operation. Dimensions of `dst` specify how many elements have to be copied;
-// for the `src` we need to know only the strides to navigate through the
-// source memory buffer.
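// For illustration (hypothetical): a shuffle that swaps the two dimensions of
// a 2-D tensor copies with dst_to_src_dim_map = [1, 0], so the dst inner
// dimension is read from the src with a non-unit stride (the Gather kind).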
-
-template <typename Scalar, typename IndexType, int NumDims, int Layout>
-class TensorBlockIOV2 {
-  static const bool IsColMajor = (Layout == ColMajor);
-
-  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
-
- public:
-  typedef DSizes<IndexType, NumDims> Dimensions;
-  typedef DSizes<int, NumDims> DimensionsMap;
-
-  struct Dst {
-    Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
-        IndexType dst_offset = 0)
-        : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
-
-    Dimensions dims;
-    Dimensions strides;
-    Scalar* data;
-    IndexType offset;
-  };
-
-  struct Src {
-    Src(const Dimensions& src_strides, const Scalar* src,
-        IndexType src_offset = 0)
-        : strides(src_strides), data(src), offset(src_offset) {}
-
-    Dimensions strides;
-    const Scalar* data;
-    IndexType offset;
-  };
-
-  // Copies data to `dst` from `src`, using provided dimensions mapping:
-  //
-  //   src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
-  //
-  // Returns the number of copied elements.
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
-      const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
-    // Copy single scalar value from `src` to `dst`.
-    if (NumDims == 0) {
-      *(dst.data + dst.offset) = *(src.data + src.offset);
-      return 1;
-    }
-
-    // Both `dst` and `src` must have contiguous innermost dimension. We also
-    // accept the special case with stride '0', because it's used as a trick to
-    // implement broadcasting.
-    {
-      int inner_dim = IsColMajor ? 0 : NumDims - 1;
-      EIGEN_UNUSED_VARIABLE(inner_dim);
-      eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
-      eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
-    }
-
-    // Give a shorter name to `dst_to_src_dim_map`.
-    const DimensionsMap& dim_map = dst_to_src_dim_map;
-
-    // Do not squeeze reordered inner dimensions.
-    int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
-
-    // NOTE: We find the innermost dimension (contiguous in memory) in the dst
-    // block, and we write data linearly into that dimension, reading it from
-    // the src. If dimensions are reordered, we might end up reading data from
-    // the src with `stride != 1`.
-    //
-    // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
-    // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
-
-    // Find the innermost dimension in the dst whose size is not 1. This is the
-    // effective inner dim.
-    int num_size_one_inner_dims = 0;
-    for (int i = 0; i < num_squeezable_dims; ++i) {
-      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
-      if (dst.dims[dst_dim] != 1) break;
-      num_size_one_inner_dims++;
-    }
-
-    // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
-    if (num_size_one_inner_dims == NumDims) {
-      *(dst.data + dst.offset) = *(src.data + src.offset);
-      return 1;
-    }
-
-    // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
-    const int dst_stride1_dim = IsColMajor
-                                    ? num_size_one_inner_dims
-                                    : NumDims - num_size_one_inner_dims - 1;
-
-    // Dimension in the src that corresponds to the dst innermost dimension.
-    const int src_dim_for_dst_stride1_dim =
-        NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
-
-    // Size of the innermost dimension (length of contiguous blocks of memory).
-    IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
-
-    // Squeeze multiple inner dims into one if they are contiguous in `dst` and
-    // `src` memory, so we can do fewer linear copy calls.
-    for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
-      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
-      const IndexType dst_stride = dst.strides[dst_dim];
-      const IndexType src_stride = src.strides[dim_map[dst_dim]];
-      if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
-        dst_inner_dim_size *= dst.dims[dst_dim];
-        ++num_size_one_inner_dims;
-      } else {
-        break;
-      }
-    }
-
-    // Setup strides to read data from `src` and write to `dst`.
-    IndexType input_offset = src.offset;
-    IndexType output_offset = dst.offset;
-    IndexType input_stride =
-        NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
-    IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
-
-    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
-    array<BlockIteratorState, at_least_1_dim> it;
-
-    // Initialize block iterator state. Squeeze away any dimension of size 1.
-    int idx = 0;  // currently initialized iterator state index
-    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
-      const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
-      if (dst.dims[dst_dim] == 1) continue;
-
-      it[idx].size = dst.dims[dst_dim];
-      it[idx].input_stride = src.strides[dim_map[dst_dim]];
-      it[idx].output_stride = dst.strides[dst_dim];
-
-      it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
-      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
-
-      idx++;
-    }
-
-    // Iterate copying data from src to dst.
-    const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
-
-#define COPY_INNER_DIM(KIND)                                           \
-  IndexType num_copied = 0;                                            \
-  for (num_copied = 0; num_copied < block_total_size;                  \
-       num_copied += dst_inner_dim_size) {                             \
-    LinCopy::template Run<KIND>(                                       \
-        typename LinCopy::Dst(output_offset, output_stride, dst.data), \
-        typename LinCopy::Src(input_offset, input_stride, src.data),   \
-        dst_inner_dim_size);                                           \
-                                                                       \
-    for (int j = 0; j < idx; ++j) {                                    \
-      if (++it[j].count < it[j].size) {                                \
-        input_offset += it[j].input_stride;                            \
-        output_offset += it[j].output_stride;                          \
-        break;                                                         \
-      }                                                                \
-      it[j].count = 0;                                                 \
-      input_offset -= it[j].input_span;                                \
-      output_offset -= it[j].output_span;                              \
-    }                                                                  \
-  }                                                                    \
-  return num_copied;
-
-    if (input_stride == 1 && output_stride == 1) {
-      COPY_INNER_DIM(LinCopy::Linear);
-    } else if (input_stride == 1 && output_stride != 1) {
-      COPY_INNER_DIM(LinCopy::Scatter);
-    } else if (input_stride == 0 && output_stride == 1) {
-      COPY_INNER_DIM(LinCopy::FillLinear);
-    } else if (input_stride == 0 && output_stride != 1) {
-      COPY_INNER_DIM(LinCopy::FillScatter);
-    } else if (output_stride == 1) {
-      COPY_INNER_DIM(LinCopy::Gather);
-    } else {
-      COPY_INNER_DIM(LinCopy::Random);
-    }
-
-#undef COPY_INNER_DIM
-  }
-
-  // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
-  // the number of copied elements.
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
-                                                              const Src& src) {
-    DimensionsMap dst_to_src_map;
-    for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
-    return Copy(dst, src, dst_to_src_map);
-  }
-
- private:
-  struct BlockIteratorState {
-    BlockIteratorState()
-        : size(0),
-          count(0),
-          input_stride(0),
-          output_stride(0),
-          input_span(0),
-          output_span(0) {}
-
-    IndexType size;
-    IndexType count;
-    IndexType input_stride;
-    IndexType output_stride;
-    IndexType input_span;
-    IndexType output_span;
-  };
-
-  // Compute how many inner dimensions it's allowed to squeeze when doing IO
-  // between two tensor blocks. It's safe to squeeze inner dimensions, only
-  // if they are not reordered.
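// For illustration (hypothetical): an identity map [0, 1, 2] allows all 3
// inner dims to be squeezed, while [0, 2, 1] (ColMajor) stops after dim 0,
// because dim 1 is reordered.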
-  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
-    int num_squeezable_dims = 0;
-    for (int i = 0; i < NumDims; ++i) {
-      const int dim = IsColMajor ? i : NumDims - i - 1;
-      if (dim_map[dim] != dim) break;
-      num_squeezable_dims++;
-    }
-    return num_squeezable_dims;
-  }
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
-// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
-//
-// Currently there is no way to write from a Tensor expression to a block of
-// memory, if dimensions are reordered. If you need to do that, you should
-// materialize a Tensor block expression into a memory buffer, and then use
-// TensorBlockIO to copy data between two memory buffers with a custom
-// `target->src` dimension map (see definition above).
-//
-// Also currently the innermost dimension of `target` must have a stride '1'
-// (contiguous in memory). This restriction could be lifted with a `pscatter`,
-// but in practice it's never needed, and there is a similar TensorBlockIO
-// workaround for that.
-//
-// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
-// where `src` is a tensor expression. Explore if it is possible to rewrite IO
-// to use expressions instead of pointers, and after that TensorBlockAssignment
-// will become an alias to IO.
-template <typename Scalar, int NumDims, typename TensorBlockExpr,
-          typename IndexType = Eigen::Index>
-class TensorBlockAssignment {
-  // We will use coeff/packet path to evaluate block expressions.
-  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
-      TensorBlockEvaluator;
-
-  typedef DSizes<IndexType, NumDims> Dimensions;
-
-  enum {
-    Vectorizable = packet_traits<Scalar>::Vectorizable,
-    PacketSize = packet_traits<Scalar>::size
-  };
-
-  template <bool Vectorizable, typename Evaluator>
-  struct InnerDimAssign {
-    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
-                                        const Evaluator& eval,
-                                        IndexType eval_offset) {
-      for (IndexType i = 0; i < count; ++i) {
-        target[i] = eval.coeff(eval_offset + i);
-      }
-    }
-  };
-
-  template <typename Evaluator>
-  struct InnerDimAssign<true, Evaluator> {
-    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
-                                        const Evaluator& eval,
-                                        IndexType eval_offset) {
-      typedef typename packet_traits<Scalar>::type Packet;
-
-      const IndexType unrolled_size = count - 4 * PacketSize;
-      const IndexType vectorized_size = count - PacketSize;
-      IndexType i = 0;
-
-      for (; i <= unrolled_size; i += 4 * PacketSize) {
-        for (int j = 0; j < 4; ++j) {
-          const IndexType idx = eval_offset + i + j * PacketSize;
-          Packet p = eval.template packet<Unaligned>(idx);
-          pstoreu<Scalar>(target + i + j * PacketSize, p);
-        }
-      }
-
-      for (; i <= vectorized_size; i += PacketSize) {
-        Packet p = eval.template packet<Unaligned>(eval_offset + i);
-        pstoreu<Scalar>(target + i, p);
-      }
-
-      for (; i < count; ++i) {
-        target[i] = eval.coeff(eval_offset + i);
-      }
-    }
-  };
-
- public:
-  struct Target {
-    Target(const Dimensions& target_dims, const Dimensions& target_strides,
-           Scalar* target_data, IndexType target_offset = 0)
-        : dims(target_dims),
-          strides(target_strides),
-          data(target_data),
-          offset(target_offset) {}
-
-    Dimensions dims;
-    Dimensions strides;
-    Scalar* data;
-    IndexType offset;
-  };
-
-  static Target target(const Dimensions& target_dims,
-                       const Dimensions& target_strides, Scalar* target_data,
-                       IndexType target_offset = 0) {
-    return Target(target_dims, target_strides, target_data, target_offset);
-  }
-
-  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
-  static Target target(
-      const DSizes<TargetDimsIndexType, NumDims>& target_dims,
-      const DSizes<TargetStridesIndexType, NumDims>& target_strides,
-      Scalar* target_data, IndexType target_offset = 0) {
-    // DSizes constructor will do index type promotion if it's safe.
-    return Target(Dimensions(target_dims), Dimensions(target_strides),
-                  target_data, target_offset);
-  }
-
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const Target& target, const TensorBlockExpr& expr) {
-    // Prepare evaluator for block expression.
-    DefaultDevice default_device;
-    TensorBlockEvaluator eval(expr, default_device);
-
-    // Tensor block expression dimensions should match destination dimensions.
-    eigen_assert(dimensions_match(target.dims, eval.dimensions()));
-
-    static const int Layout = TensorBlockEvaluator::Layout;
-    static const bool is_col_major = Layout == ColMajor;
-
-    // Initialize output inner dimension size based on a layout.
-    const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
-    const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
-    IndexType output_inner_dim_size = target.dims[inner_dim_idx];
-
-    // Target inner dimension stride must be '1'.
-    eigen_assert(target.strides[inner_dim_idx] == 1);
-
-    // Squeeze multiple inner dims into one if they are contiguous in `target`.
-    IndexType num_squeezed_dims = 0;
-    for (Index i = 1; i < NumDims; ++i) {
-      const Index dim = is_col_major ? i : NumDims - i - 1;
-      const IndexType target_stride = target.strides[dim];
-
-      if (output_inner_dim_size == target_stride) {
-        output_inner_dim_size *= target.dims[dim];
-        num_squeezed_dims++;
-      } else {
-        break;
-      }
-    }
-
-    // Initialize output block iterator state. Dimensions in this array are
-    // always in inner_most -> outer_most order (col major layout).
-    array<BlockIteratorState, NumDims> it;
-
-    int idx = 0;  // currently initialized iterator state index
-    for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
-      const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
-
-      it[idx].count = 0;
-      it[idx].size = target.dims[dim];
-      it[idx].output_stride = target.strides[dim];
-      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
-      idx++;
-    }
-
-    // We read block expression from the beginning, and start writing data to
-    // `target` at given offset.
-    IndexType input_offset = 0;
-    IndexType output_offset = target.offset;
-
-    // Iterate copying data from `eval` to `target`.
-    for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
-      // Assign to `target` at current offset.
-      InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
-                     TensorBlockEvaluator>::Run(target.data + output_offset,
-                                                output_inner_dim_size, eval,
-                                                input_offset);
-
-      // Move input offset forward by the number of assigned coefficients.
-      input_offset += output_inner_dim_size;
-
-      // Update index.
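// (For illustration: the loop below works like an odometer over the
// non-squeezed dims; e.g. two hypothetical states of sizes [2, 3] visit
// counts (0,0), (1,0), (0,1), (1,1), (0,2), (1,2) and then wrap.)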
-      for (int j = 0; j < idx; ++j) {
-        if (++it[j].count < it[j].size) {
-          output_offset += it[j].output_stride;
-          break;
-        }
-        it[j].count = 0;
-        output_offset -= it[j].output_span;
-      }
-    }
-  }
-
- private:
-  struct BlockIteratorState {
-    BlockIteratorState()
-        : count(0), size(0), output_stride(0), output_span(0) {}
-
-    IndexType count;
-    IndexType size;
-    IndexType output_stride;
-    IndexType output_span;
-  };
-};
-
-// -------------------------------------------------------------------------- //
-
-}  // namespace internal
-}  // namespace Eigen
-
-#endif  // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 454b0f752..620c8741c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -114,7 +114,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   enum {
     IsAligned = true,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
     PreferBlockAccess = true,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     RawAccess = false
@@ -130,12 +130,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
   typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
 
-  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
       ArgTensorBlock;
 
   typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
                                                      Layout, Index>
-      TensorBlockV2;
+      TensorBlock;
   //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
@@ -617,19 +617,19 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockV2ResourceRequirements getResourceRequirements() const {
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
     // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
     // tensors. But this might need further tuning.
     const size_t target_block_size = numext::maxi<size_t>(
         1, m_device.firstLevelCacheSize() / sizeof(Scalar));
 
-    return internal::TensorBlockV2ResourceRequirements::merge(
-        {internal::TensorBlockV2ShapeType::kSkewedInnerDims, target_block_size},
+    return internal::TensorBlockResourceRequirements::merge(
+        {internal::TensorBlockShapeType::kSkewedInnerDims, target_block_size},
         m_impl.getResourceRequirements());
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
-  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
           bool /*root_of_expr_ast*/ = false) const {
     BlockBroadcastingParams params = blockBroadcastingParams(desc);
 
@@ -638,8 +638,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     }
 
     // Prepare storage for the materialized broadcasting result.
-    const typename TensorBlockV2::Storage block_storage =
-        TensorBlockV2::prepareStorage(desc, scratch);
+    const typename TensorBlock::Storage block_storage =
+        TensorBlock::prepareStorage(desc, scratch);
     ScalarNoConst* materialized_output = block_storage.data();
 
     // We potentially will need to materialize input blocks.
@@ -843,10 +843,10 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     return params;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 emptyBlock() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const {
     DSizes<Index, NumDims> dimensions;
     for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
-    return TensorBlockV2(internal::TensorBlockKind::kView, NULL, dimensions);
+    return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim(
@@ -856,7 +856,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       size_t* materialized_input_size) const {
     if (params.bcast_dim_size == 1) {
       // We just need one block read using the ready-set values above.
-      return BroadcastBlockV2(
+      return BroadcastBlock(
           params.input_block_sizes, params.input_block_strides,
           params.bcast_block_sizes, params.bcast_block_strides,
           params.bcast_input_strides, bcast_offset, 0, scratch,
@@ -873,7 +873,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       params.bcast_block_strides[broadcast_bcast_dim] =
           params.output_strides[params.bcast_dim];
 
-      return BroadcastBlockV2(
+      return BroadcastBlock(
           params.input_block_sizes, params.input_block_strides,
           params.bcast_block_sizes, params.bcast_block_strides,
           params.bcast_input_strides, bcast_offset, 0, scratch,
@@ -942,7 +942,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
             params.output_strides[params.bcast_dim] *
             params.input_dims[params.bcast_dim];
 
-        num_output_coeffs += BroadcastBlockV2(
+        num_output_coeffs += BroadcastBlock(
             params.input_block_sizes, params.input_block_strides,
             params.bcast_block_sizes, params.bcast_block_strides,
            params.bcast_input_strides, bcast_offset, 0, scratch,
@@ -964,7 +964,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
         const Index offset = (first_multiple - bcast_dim_left_index) *
                              m_outputStrides[params.bcast_dim];
 
-        num_output_coeffs += BroadcastBlockV2(
+        num_output_coeffs += BroadcastBlock(
            params.input_block_sizes, params.input_block_strides,
            params.bcast_block_sizes, params.bcast_block_strides,
            params.bcast_input_strides, bcast_offset, offset, scratch,
@@ -987,7 +987,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
         const Index offset = (last_multiple - bcast_dim_left_index) *
                              m_outputStrides[params.bcast_dim];
 
-        num_output_coeffs += BroadcastBlockV2(
+        num_output_coeffs += BroadcastBlock(
            params.input_block_sizes, params.input_block_strides,
            params.bcast_block_sizes, params.bcast_block_strides,
            params.bcast_input_strides, bcast_offset, offset, scratch,
@@ -1005,7 +1005,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       params.bcast_block_strides[copy_bcast_dim] =
           params.output_strides[params.bcast_dim];
 
-      num_output_coeffs += BroadcastBlockV2(
+      num_output_coeffs += BroadcastBlock(
          params.input_block_sizes, params.input_block_strides,
          params.bcast_block_sizes, params.bcast_block_strides,
          params.bcast_input_strides, bcast_offset, 0, scratch,
@@ -1016,7 +1016,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     }
  }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockV2(
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock(
       const Dimensions& input_block_sizes,
       const Dimensions& input_block_strides,
       const BroadcastDimensions& bcast_block_sizes,
@@ -1032,7 +1032,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
         IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset),
         input_block_sizes);
 
-    ArgTensorBlock input_block = m_impl.blockV2(input_desc, scratch);
+    ArgTensorBlock input_block = m_impl.block(input_desc, scratch);
 
     // ---------------------------------------------------------------------- //
     // Materialize input block into a temporary memory buffer only if it's not
@@ -1071,14 +1071,14 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     // ---------------------------------------------------------------------- //
     // Copy data from materialized input block to the materialized output, using
     // given broadcast strides (strides with zeroes).
-    typedef internal::TensorBlockIOV2<ScalarNoConst, Index, NumDims, Layout>
-        TensorBlockIOV2;
+    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
+        TensorBlockIO;
 
-    typename TensorBlockIOV2::Src src(bcast_input_strides, input_buffer);
-    typename TensorBlockIOV2::Dst dst(bcast_block_sizes, bcast_block_strides,
+    typename TensorBlockIO::Src src(bcast_input_strides, input_buffer);
+    typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides,
                                       materialized_output + offset);
 
-    return TensorBlockIOV2::Copy(dst, src);
+    return TensorBlockIO::Copy(dst, src);
   }
 
  protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 268c3246a..f51a8559d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -148,7 +148,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
   enum {
     IsAligned = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
 
     // Chipping of outer-most dimension is a trivial operation, because we can
    // read and write directly from the underlying tensor using single offset.
    IsOuterChipping = (static_cast<int>(Layout) == ColMajor && DimId == NumInputDims - 1) ||
                      (static_cast<int>(Layout) == RowMajor && DimId == 0),
@@ -172,12 +172,12 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
   typedef internal::TensorBlockDescriptor<NumInputDims, Index>
       ArgTensorBlockDesc;
 
-  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
       ArgTensorBlock;
 
   typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
                                                      Layout, Index>
-      TensorBlockV2;
+      TensorBlock;
   //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -295,17 +295,17 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockV2ResourceRequirements getResourceRequirements() const {
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
     const size_t target_block_size =
         numext::maxi<size_t>(1, m_device.lastLevelCacheSize() / sizeof(Scalar));
-    return internal::TensorBlockV2ResourceRequirements::merge(
-        {internal::TensorBlockV2ShapeType::kSkewedInnerDims, target_block_size},
+    return internal::TensorBlockResourceRequirements::merge(
+        {internal::TensorBlockShapeType::kSkewedInnerDims, target_block_size},
         m_impl.getResourceRequirements());
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
-  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
           bool root_of_expr_ast = false) const {
     const Index chip_dim = m_dim.actualDim();
@@ -334,20 +334,20 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
                                      arg_destination_strides);
     }
 
-    ArgTensorBlock arg_block = m_impl.blockV2(arg_desc, scratch, root_of_expr_ast);
+    ArgTensorBlock arg_block = m_impl.block(arg_desc, scratch, root_of_expr_ast);
     if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
 
     if (arg_block.data() != NULL) {
       // Forward argument block buffer if possible.
-      return TensorBlockV2(arg_block.kind(), arg_block.data(),
+      return TensorBlock(arg_block.kind(), arg_block.data(),
                            desc.dimensions());
 
     } else {
       // Assign argument block expression to a buffer.
 
       // Prepare storage for the materialized chipping result.
- const typename TensorBlockV2::Storage block_storage = - TensorBlockV2::prepareStorage(desc, scratch); + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); typedef internal::TensorBlockAssignment< ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index> @@ -442,7 +442,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = TensorEvaluator::RawAccess, + BlockAccess = TensorEvaluator::RawAccess, Layout = TensorEvaluator::Layout, RawAccess = false }; @@ -499,9 +499,9 @@ struct TensorEvaluator, Device> } } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( - const TensorBlockDesc& desc, const TensorBlockV2& block) { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { assert(this->m_impl.data() != NULL); const Index chip_dim = this->m_dim.actualDim(); @@ -514,7 +514,7 @@ struct TensorEvaluator, Device> } typedef TensorReshapingOp, - const typename TensorBlockV2::XprType> + const typename TensorBlock::XprType> TensorBlockExpr; typedef internal::TensorBlockAssignment::PacketAccess && TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess || TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, @@ -133,7 +133,7 @@ struct TensorEvaluator::PacketAccess && TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess || TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, @@ -332,7 +332,7 @@ template::size > 1), - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -389,7 +389,7 @@ struct TensorContractionEvaluatorBase }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// // Most of the code is assuming that both input tensors are ColMajor. 
If the diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index f9f90ec02..cdbafbbb1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -302,7 +302,7 @@ struct TensorEvaluator, Device> TensorEvaluator::PacketAccess & internal::type_casting_traits::VectorizedCast, #endif - BlockAccessV2 = TensorEvaluator::BlockAccessV2, + BlockAccess = TensorEvaluator::BlockAccess, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, RawAccess = false @@ -314,7 +314,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - typedef typename TensorEvaluator::TensorBlockV2 + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; struct TensorConversionOpBlockFactory { @@ -331,7 +331,7 @@ struct TensorEvaluator, Device> typedef internal::TensorUnaryExprBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -398,14 +398,14 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { + internal::TensorBlockResourceRequirements getResourceRequirements() const { return m_impl.getResourceRequirements(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { - return TensorBlockV2(m_impl.blockV2(desc, scratch), + return TensorBlock(m_impl.block(desc, scratch), TensorConversionOpBlockFactory()); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 44068fedc..27ad9f147 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -309,7 +309,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -317,7 +317,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -794,7 +794,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -302,7 +302,7 @@ struct TensorEvaluator, Devi enum { IsAligned = false, PacketAccess = (PacketType::size > 1), - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -103,7 +103,7 @@ struct TensorEvaluator, Devi }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef 
internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) @@ -268,7 +268,7 @@ struct TensorEvaluator::size > 1), - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -276,7 +276,7 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = true, + BlockAccess = true, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -123,7 +123,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - typedef typename TensorEvaluator::TensorBlockV2 + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; typedef internal::TensorBlockAssignment< @@ -165,11 +165,11 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { + internal::TensorBlockResourceRequirements getResourceRequirements() const { return m_impl.getResourceRequirements(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2( + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock( TensorBlockDesc& desc, TensorBlockScratch& scratch) { // Add `m_buffer` as destination buffer to the block descriptor. desc.template AddDestinationBuffer( @@ -177,7 +177,7 @@ struct TensorEvaluator, Device> /*dst_strides=*/internal::strides(m_impl.dimensions())); ArgTensorBlock block = - m_impl.blockV2(desc, scratch, /*root_of_expr_ast=*/true); + m_impl.block(desc, scratch, /*root_of_expr_ast=*/true); // If block was evaluated into a destination buffer, there is no need to do // an assignment. 
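
The EvalTo hunk above is the clearest picture of the destination-buffer protocol: the parent offers its output buffer through the descriptor, and the assignment is skipped when the child evaluated directly into it. A sketch of that handshake; the kMaterializedInOutput check is an assumption based on the "no need to do an assignment" comment, and the scalar type and layout are stand-ins.

    // Sketch of the destination-buffer handshake used by evalBlock() above.
    // `Evaluator` is any block-capable evaluator; ColMajor/float are assumed.
    template <typename Evaluator, typename BlockDesc, typename Scratch>
    void eval_block_into_sketch(Evaluator& impl, float* output,
                                BlockDesc& desc, Scratch& scratch) {
      desc.template AddDestinationBuffer<Eigen::ColMajor>(
          /*dst_base=*/output + desc.offset(),
          /*dst_strides=*/Eigen::internal::strides<Eigen::ColMajor>(
              impl.dimensions()));

      auto block = impl.block(desc, scratch, /*root_of_expr_ast=*/true);

      // Assumed kind check: assign only if the child did not already
      // materialize into `output` (the assignment itself is elided here;
      // the patch uses internal::TensorBlockAssignment for it).
      if (block.kind() !=
          Eigen::internal::TensorBlockKind::kMaterializedInOutput) {
        // internal::TensorBlockAssignment would run here.
      }
      block.cleanup();
    }
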
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 613a8347d..146cc325e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -45,7 +45,7 @@ struct TensorEvaluator enum { IsAligned = Derived::IsAligned, PacketAccess = (PacketType::size > 1), - BlockAccessV2 = internal::is_arithmetic::type>::value, + BlockAccess = internal::is_arithmetic::type>::value, PreferBlockAccess = false, Layout = Derived::Layout, CoordAccess = NumCoords > 0, @@ -60,7 +60,7 @@ struct TensorEvaluator typedef typename internal::TensorMaterializedBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) @@ -150,23 +150,23 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { - return internal::TensorBlockV2ResourceRequirements::any(); + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { assert(m_data != NULL); - return TensorBlockV2::materialize(m_data, m_dims, desc, scratch); + return TensorBlock::materialize(m_data, m_dims, desc, scratch); } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( - const TensorBlockDesc& desc, const TensorBlockV2& block) { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { assert(m_data != NULL); - typedef typename TensorBlockV2::XprType TensorBlockExpr; + typedef typename TensorBlock::XprType TensorBlockExpr; typedef internal::TensorBlockAssignment TensorBlockAssign; @@ -246,7 +246,7 @@ struct TensorEvaluator enum { IsAligned = Derived::IsAligned, PacketAccess = (PacketType::size > 1), - BlockAccessV2 = internal::is_arithmetic::value, + BlockAccess = internal::is_arithmetic::value, PreferBlockAccess = false, Layout = Derived::Layout, CoordAccess = NumCoords > 0, @@ -259,7 +259,7 @@ struct TensorEvaluator typedef typename internal::TensorMaterializedBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) @@ -323,15 +323,15 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { - return internal::TensorBlockV2ResourceRequirements::any(); + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { assert(m_data != NULL); - return TensorBlockV2::materialize(m_data, m_dims, desc, scratch); + return TensorBlock::materialize(m_data, m_dims, desc, scratch); } EIGEN_DEVICE_FUNC 
EvaluatorPointerType data() const { return m_data; } @@ -378,7 +378,7 @@ struct TensorEvaluator, Device> && (PacketType::size >1) #endif , - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -386,7 +386,7 @@ struct TensorEvaluator, Device> }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -448,7 +448,7 @@ struct TensorEvaluator, Device> IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, - BlockAccessV2 = TensorEvaluator::BlockAccessV2, + BlockAccess = TensorEvaluator::BlockAccess, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -476,11 +476,11 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - typedef typename TensorEvaluator::TensorBlockV2 + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; typedef internal::TensorCwiseUnaryBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -520,14 +520,14 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { + internal::TensorBlockResourceRequirements getResourceRequirements() const { return m_argImpl.getResourceRequirements(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { - return TensorBlockV2(m_argImpl.blockV2(desc, scratch), m_functor); + return TensorBlock(m_argImpl.block(desc, scratch), m_functor); } EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } @@ -560,8 +560,8 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, - BlockAccessV2 = TensorEvaluator::BlockAccessV2 & - TensorEvaluator::BlockAccessV2, + BlockAccess = TensorEvaluator::BlockAccess & + TensorEvaluator::BlockAccess, PreferBlockAccess = TensorEvaluator::PreferBlockAccess | TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, @@ -595,14 +595,14 @@ struct TensorEvaluator TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - typedef typename TensorEvaluator::TensorBlockV2 + typedef typename TensorEvaluator::TensorBlock LeftTensorBlock; - typedef typename TensorEvaluator::TensorBlockV2 + typedef typename TensorEvaluator::TensorBlock RightTensorBlock; typedef internal::TensorCwiseBinaryBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -653,18 +653,18 @@ struct TensorEvaluator::PacketAccess && TensorEvaluator::PacketAccess && internal::functor_traits::PacketAccess, - 
BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess || TensorEvaluator::PreferBlockAccess || TensorEvaluator::PreferBlockAccess, @@ -739,7 +739,7 @@ struct TensorEvaluator PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & PacketType::HasBlend, - BlockAccessV2 = TensorEvaluator::BlockAccessV2 && - TensorEvaluator::BlockAccessV2 && - TensorEvaluator::BlockAccessV2, + BlockAccess = TensorEvaluator::BlockAccess && + TensorEvaluator::BlockAccess && + TensorEvaluator::BlockAccess, PreferBlockAccess = TensorEvaluator::PreferBlockAccess || TensorEvaluator::PreferBlockAccess || TensorEvaluator::PreferBlockAccess, @@ -850,11 +850,11 @@ struct TensorEvaluator typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - typedef typename TensorEvaluator::TensorBlockV2 + typedef typename TensorEvaluator::TensorBlock IfArgTensorBlock; - typedef typename TensorEvaluator::TensorBlockV2 + typedef typename TensorEvaluator::TensorBlock ThenArgTensorBlock; - typedef typename TensorEvaluator::TensorBlockV2 + typedef typename TensorEvaluator::TensorBlock ElseArgTensorBlock; struct TensorSelectOpBlockFactory { @@ -873,7 +873,7 @@ struct TensorEvaluator typedef internal::TensorTernaryExprBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -933,24 +933,24 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { - return internal::TensorBlockV2ResourceRequirements::merge( + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::merge( m_condImpl.getResourceRequirements(), - internal::TensorBlockV2ResourceRequirements::merge( + internal::TensorBlockResourceRequirements::merge( m_thenImpl.getResourceRequirements(), m_elseImpl.getResourceRequirements())); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { // It's unsafe to pass destination buffer to underlying expressions, because // output might be aliased with one of the inputs. desc.DropDestinationBuffer(); - return TensorBlockV2( - m_condImpl.blockV2(desc, scratch), m_thenImpl.blockV2(desc, scratch), - m_elseImpl.blockV2(desc, scratch), TensorSelectOpBlockFactory()); + return TensorBlock( + m_condImpl.block(desc, scratch), m_thenImpl.block(desc, scratch), + m_elseImpl.block(desc, scratch), TensorSelectOpBlockFactory()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 7b7b670ed..b2327da1e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -172,7 +172,7 @@ class TensorExecutor + typedef TensorBlockMapper TensorBlockMapper; typedef internal::TensorBlockDescriptor @@ -187,7 +187,7 @@ class TensorExecutor GetTensorExecutorTilingContext( const ThreadPoolDevice& device, const Evaluator& evaluator, bool allocate_buffer = true) { // Query expression tree for desired block size/shape. 
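
The TensorExecutor hunks around this point tie the renamed pieces together: query the expression's TensorBlockResourceRequirements, build a TensorBlockMapper, then evaluate block by block. A single-threaded sketch of that loop over a root assignment evaluator; the rank and layout are fixed arbitrarily.

    // Sketch of the tiled evaluation loop that TensorExecutor drives below.
    // `evaluator` must expose evalBlock(); rank 2 / ColMajor are assumed.
    template <typename Evaluator, typename Device>
    void tiled_eval_sketch(Evaluator& evaluator, const Device& device) {
      using namespace Eigen;

      const internal::TensorBlockResourceRequirements requirements =
          evaluator.getResourceRequirements();

      internal::TensorBlockMapper<2, ColMajor> block_mapper(
          evaluator.dimensions(), requirements);

      internal::TensorBlockScratchAllocator<Device> scratch(device);
      for (Index i = 0; i < block_mapper.blockCount(); ++i) {
        internal::TensorBlockDescriptor<2> desc =
            block_mapper.blockDescriptor(i);
        evaluator.evalBlock(desc, scratch);
        scratch.reset();  // reuse scratch memory for the next block
      }
    }
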
- const TensorBlockV2ResourceRequirements requirements = + const TensorBlockResourceRequirements requirements = evaluator.getResourceRequirements(); int num_threads = device.numThreads(); @@ -377,7 +377,7 @@ class TensorExecutor::NumDimensions; typedef TensorEvaluator Evaluator; - typedef TensorBlockV2Mapper BlockMapper; + typedef TensorBlockMapper BlockMapper; typedef TensorExecutorTilingContext TilingContext; typedef internal::TensorBlockDescriptor @@ -402,7 +402,7 @@ class TensorExecutor::NumDimensions; typedef TensorEvaluator Evaluator; - typedef TensorBlockV2Mapper BlockMapper; + typedef TensorBlockMapper BlockMapper; typedef TensorExecutorTilingContext TilingContext; typedef internal::TensorBlockDescriptor TensorBlockDesc; @@ -510,7 +510,7 @@ class TensorAsyncExecutortiling.block_mapper.blockDescriptor(block_idx); - ctx->evaluator.evalBlockV2(desc, scratch); + ctx->evaluator.evalBlock(desc, scratch); scratch.reset(); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index a8841bc38..c62bc5fa9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -133,7 +133,7 @@ struct TensorEvaluator, D enum { IsAligned = false, PacketAccess = true, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -141,7 +141,7 @@ struct TensorEvaluator, D }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index ea3ea2c91..a5be54bcd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -41,7 +41,7 @@ class TensorFixedSize : public TensorBase0), PacketAccess = (internal::packet_traits::size > 1), - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = Options_ & RowMajor ? 
RowMajor : ColMajor, CoordAccess = true, @@ -49,7 +49,7 @@ class TensorFixedSize : public TensorBase, Device> enum { IsAligned = true, PacketAccess = (PacketType::size > 1), - BlockAccessV2 = internal::is_arithmetic::value, + BlockAccess = internal::is_arithmetic::value, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = true @@ -110,7 +110,7 @@ struct TensorEvaluator, Device> typedef typename internal::TensorMaterializedBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -177,15 +177,15 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { - return internal::TensorBlockV2ResourceRequirements::any(); + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { assert(m_buffer != NULL); - return TensorBlockV2::materialize(m_buffer, m_impl.dimensions(), desc, scratch); + return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index b115e502b..246ebe44e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -157,7 +157,7 @@ struct IsVectorizable { // Tiled evaluation strategy. enum TiledEvaluation { Off = 0, // tiled evaluation is not supported - On = 1, // still work in progress (see TensorBlockV2.h) + On = 1, // still work in progress (see TensorBlock.h) }; template @@ -165,12 +165,12 @@ struct IsTileable { // Check that block evaluation is supported and it's a preferred option (at // least one sub-expression has much faster block evaluation, e.g. // broadcasting). - static const bool BlockAccessV2 = - TensorEvaluator::BlockAccessV2 && + static const bool BlockAccess = + TensorEvaluator::BlockAccess && TensorEvaluator::PreferBlockAccess; static const TiledEvaluation value = - BlockAccessV2 ? TiledEvaluation::On : TiledEvaluation::Off; + BlockAccess ? 
TiledEvaluation::On : TiledEvaluation::Off; }; template , Device> enum { IsAligned = false, PacketAccess = (PacketType::size > 1), - BlockAccessV2 = true, + BlockAccess = true, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -108,7 +108,7 @@ struct TensorEvaluator, Device> typedef typename internal::TensorMaterializedBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -165,10 +165,10 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { + internal::TensorBlockResourceRequirements getResourceRequirements() const { const size_t target_block_size = numext::maxi( 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); - return {internal::TensorBlockV2ShapeType::kSkewedInnerDims, + return {internal::TensorBlockShapeType::kSkewedInnerDims, target_block_size}; } @@ -179,8 +179,8 @@ struct TensorEvaluator, Device> Index count; }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { static const bool is_col_major = static_cast(Layout) == static_cast(ColMajor); @@ -206,8 +206,8 @@ struct TensorEvaluator, Device> eigen_assert(it[0].stride == 1); // Prepare storage for the materialized generator result. - const typename TensorBlockV2::Storage block_storage = - TensorBlockV2::prepareStorage(desc, scratch); + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); CoeffReturnType* block_buffer = block_storage.data(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 5010d5c95..49d1004f3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -231,7 +231,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -239,7 +239,7 @@ struct TensorEvaluator, Device> }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index ef6b62620..7dadec7fb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -92,7 +92,7 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -100,7 +100,7 @@ struct TensorEvaluator, Device> }; //===- Tensor block evaluation strategy (see TensorBlock.h) 
-------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index 695726e10..05fa80e59 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -119,7 +119,7 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false, // to be implemented @@ -127,7 +127,7 @@ struct TensorEvaluator, Device> }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -198,14 +198,14 @@ template enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false // to be implemented }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 7697add4b..5c2036626 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -138,7 +138,7 @@ struct TensorEvaluator, Device> // For trivial reshapes with raw access to underlying data we will provide // zero overhead block access. // TODO(ezhulenev): Consider adding block access without raw access? 
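
Most hunks in this patch are variations of one traits edit: BlockAccessV2 becomes BlockAccess, and evaluators without a block implementation export the TensorBlockNotImplemented tag. A condensed, hypothetical evaluator showing the post-rename shape of that pattern:

    #include <unsupported/Eigen/CXX11/Tensor>

    // Hypothetical evaluator illustrating the renamed traits. Members that
    // do not matter for the rename are omitted.
    struct SomeEvaluatorSketch {
      enum {
        IsAligned         = false,
        PacketAccess      = false,
        BlockAccess       = false,  // was BlockAccessV2
        PreferBlockAccess = false,
        RawAccess         = false
      };

      //===- Tensor block evaluation strategy (see TensorBlock.h) --------===//
      typedef Eigen::internal::TensorBlockNotImplemented TensorBlock;
      //===----------------------------------------------------------------===//
    };
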
- BlockAccessV2 = TensorEvaluator::RawAccess && + BlockAccess = TensorEvaluator::RawAccess && NumInputDims > 0 && NumOutputDims > 0, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, @@ -155,7 +155,7 @@ struct TensorEvaluator, Device> typedef typename internal::TensorMaterializedBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -199,8 +199,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { - return internal::TensorBlockV2ResourceRequirements::any(); + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); } // required in block(OutputTensorBlock* output_block) const @@ -212,8 +212,8 @@ struct TensorEvaluator, Device> Index count; }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { eigen_assert(m_impl.data() != NULL); eigen_assert((kind == Runtime) || @@ -223,12 +223,12 @@ struct TensorEvaluator, Device> if (kind == OneByN || kind == NByOne) { // We can guarantee at compile time that block is just a contiguous slice // of the underlying expression memory buffer. - return TensorBlockV2(internal::TensorBlockKind::kView, + return TensorBlock(internal::TensorBlockKind::kView, m_impl.data() + desc.offset(), desc.dimensions()); } else { // This will do additional runtime checks, and in the end it might be also // a view, or it might be a block materialized in the temporary buffer. - return TensorBlockV2::materialize(m_impl.data(), m_dimensions, desc, + return TensorBlock::materialize(m_impl.data(), m_dimensions, desc, scratch); } } @@ -264,7 +264,7 @@ template enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = TensorEvaluator::RawAccess, + BlockAccess = TensorEvaluator::RawAccess, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -297,7 +297,7 @@ template } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( const TensorBlockDesc& desc, const TensorBlock& block) { assert(this->m_impl.data() != NULL); @@ -456,7 +456,7 @@ struct TensorEvaluator, Devi // slice offsets and sizes. IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = TensorEvaluator::BlockAccessV2, + BlockAccess = TensorEvaluator::BlockAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -470,8 +470,8 @@ struct TensorEvaluator, Devi typedef internal::TensorBlockScratchAllocator TensorBlockScratch; // Tensor slicing does not change the block type. - typedef typename TensorEvaluator::TensorBlockV2 - TensorBlockV2; + typedef typename TensorEvaluator::TensorBlock + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -547,7 +547,7 @@ struct TensorEvaluator, Devi } } // Use memcpy if it's going to be faster than using the regular evaluation. 
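
The slicing evaluator below merges its own preference for skewed inner dimensions with whatever its argument expression requires. A standalone sketch of that idiom; the float scalar and the generic device/child parameters are stand-ins.

    // Sketch of the requirement-merging idiom used by the renamed
    // getResourceRequirements() implementations in this patch.
    template <typename Device, typename ChildEvaluator>
    Eigen::internal::TensorBlockResourceRequirements requirements_sketch(
        const Device& device, const ChildEvaluator& child) {
      const size_t target_block_size = Eigen::numext::maxi<size_t>(
          1, device.lastLevelCacheSize() / sizeof(float));

      return Eigen::internal::TensorBlockResourceRequirements::merge(
          {Eigen::internal::TensorBlockShapeType::kSkewedInnerDims,
           target_block_size},
          child.getResourceRequirements());
    }
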
- const MemcpyTriggerForSlicing trigger(m_device); + const MemcpyTriggerForSlicing trigger(m_device); if (trigger(internal::array_prod(dimensions()), contiguous_values)) { EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data(); for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { @@ -633,19 +633,19 @@ struct TensorEvaluator, Devi } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { + internal::TensorBlockResourceRequirements getResourceRequirements() const { const size_t target_block_size = numext::maxi(1, m_device.lastLevelCacheSize() / sizeof(Scalar)); - return internal::TensorBlockV2ResourceRequirements::merge( - {internal::TensorBlockV2ShapeType::kSkewedInnerDims, target_block_size}, + return internal::TensorBlockResourceRequirements::merge( + {internal::TensorBlockShapeType::kSkewedInnerDims, target_block_size}, m_impl.getResourceRequirements()); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset())); - TensorBlockV2 block = m_impl.blockV2(arg_desc, scratch); + TensorBlock block = m_impl.block(arg_desc, scratch); if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); return block; } @@ -745,7 +745,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = TensorEvaluator::BlockAccessV2, + BlockAccess = TensorEvaluator::BlockAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -823,11 +823,11 @@ struct TensorEvaluator, Device> } } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( - const TensorBlockDesc& desc, const TensorBlockV2& block) { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset())); - this->m_impl.writeBlockV2(arg_desc, block); + this->m_impl.writeBlock(arg_desc, block); } }; @@ -935,14 +935,14 @@ struct TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, RawAccess = false }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -1116,7 +1116,7 @@ struct TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = TensorEvaluator::CoordAccess, @@ -1124,7 +1124,7 @@ struct TensorEvaluator, Device enum { IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = TensorEvaluator::RawAccess, + BlockAccess = TensorEvaluator::RawAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = true, @@ -113,7 +113,7 @@ struct TensorEvaluator, Device typedef typename internal::TensorMaterializedBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -228,20 +228,20 
@@ struct TensorEvaluator, Device } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { + internal::TensorBlockResourceRequirements getResourceRequirements() const { const size_t target_block_size = numext::maxi(1, m_device.lastLevelCacheSize() / sizeof(Scalar)); - return internal::TensorBlockV2ResourceRequirements::merge( - {internal::TensorBlockV2ShapeType::kSkewedInnerDims, target_block_size}, + return internal::TensorBlockResourceRequirements::merge( + {internal::TensorBlockShapeType::kSkewedInnerDims, target_block_size}, m_impl.getResourceRequirements()); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { // If one of the dimensions is zero, return empty block view. if (desc.size() == 0) { - return TensorBlockV2(internal::TensorBlockKind::kView, NULL, + return TensorBlock(internal::TensorBlockKind::kView, NULL, desc.dimensions()); } @@ -355,8 +355,8 @@ struct TensorEvaluator, Device typedef internal::StridedLinearBufferCopy LinCopy; // Prepare storage for the materialized padding result. - const typename TensorBlockV2::Storage block_storage = - TensorBlockV2::prepareStorage(desc, scratch); + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); // Iterate copying data from `m_impl.data()` to the output buffer. for (Index size = 0; size < output_size; size += output_inner_dim_size) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 4abe58ecd..64a436e50 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -96,7 +96,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -104,7 +104,7 @@ struct TensorEvaluator, Device> }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index c600c319d..ad14d82f8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -584,7 +584,7 @@ struct TensorReductionEvaluatorBase::Layout, CoordAccess = false, // to be implemented @@ -594,7 +594,7 @@ struct TensorReductionEvaluatorBase::type ScalarNoConst; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index 
ff5bfad46..4c6420586 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -141,7 +141,7 @@ template class TensorRef : public TensorBase class TensorRef : public TensorBase, Device> enum { IsAligned = false, PacketAccess = false, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorRef::Layout, CoordAccess = false, // to be implemented @@ -385,7 +385,7 @@ struct TensorEvaluator, Device> }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef& m, const Device&) @@ -430,13 +430,13 @@ struct TensorEvaluator, Device> : public TensorEvaluator& m, const Device& d) : Base(m, d) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 68699351b..c4ac81db8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -115,7 +115,7 @@ struct TensorEvaluator, Device enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = NumDims > 0, + BlockAccess = NumDims > 0, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -128,12 +128,12 @@ struct TensorEvaluator, Device typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - typedef typename TensorEvaluator::TensorBlockV2 + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; typedef typename internal::TensorMaterializedBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, @@ -245,15 +245,15 @@ struct TensorEvaluator, Device } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { + internal::TensorBlockResourceRequirements getResourceRequirements() const { const size_t target_block_size = numext::maxi(1, m_device.lastLevelCacheSize() / sizeof(Scalar)); - return {internal::TensorBlockV2ShapeType::kSkewedInnerDims, + return {internal::TensorBlockShapeType::kSkewedInnerDims, target_block_size}; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { // TODO(ezhulenev): If underlying tensor expression supports and prefers // block evaluation we must use it. Currently we use coeff and packet @@ -322,8 +322,8 @@ struct TensorEvaluator, Device const Index inner_dim_size = it[effective_inner_dim].size; // Prepare storage for the materialized reverse result. 
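
On the producer side, evaluators that build their result per block reserve storage through prepareStorage() and hand the filled buffer back as a materialized block, as the reverse evaluator does next. A sketch under assumed types; the AsTensorMaterializedBlock() finalizer is an assumption, as it is not shown in this part of the patch.

    // Sketch of the producer-side materialization pattern. float/rank-2/
    // ColMajor and the AsTensorMaterializedBlock() finalizer are assumed.
    typedef Eigen::internal::TensorMaterializedBlock<float, 2, Eigen::ColMajor>
        MaterializedBlock;

    template <typename BlockDesc, typename Scratch>
    MaterializedBlock materialize_sketch(BlockDesc& desc, Scratch& scratch) {
      const typename MaterializedBlock::Storage block_storage =
          MaterializedBlock::prepareStorage(desc, scratch);

      float* block_buffer = block_storage.data();
      for (Eigen::Index i = 0; i < desc.size(); ++i) {
        block_buffer[i] = 0.0f;  // stand-in for the real per-coefficient work
      }
      return block_storage.AsTensorMaterializedBlock();
    }
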
- const typename TensorBlockV2::Storage block_storage = - TensorBlockV2::prepareStorage(desc, scratch); + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); CoeffReturnType* block_buffer = block_storage.data(); while (it[NumDims - 1].count < it[NumDims - 1].size) { @@ -433,7 +433,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -449,7 +449,7 @@ struct TensorEvaluator, Device> static const int PacketSize = PacketType::size; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index d8005d604..ee465dd0f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -99,7 +99,7 @@ struct TensorEvaluator, Device> { enum { IsAligned = false, PacketAccess = (PacketType::size > 1), - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -107,7 +107,7 @@ struct TensorEvaluator, Device> { }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 42bca8172..1a6891ffd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -115,7 +115,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = (PacketType::size > 1), - BlockAccessV2 = TensorEvaluator::RawAccess, + BlockAccess = TensorEvaluator::RawAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -130,7 +130,7 @@ struct TensorEvaluator, Device> typedef typename internal::TensorMaterializedBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, @@ -245,7 +245,7 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { + internal::TensorBlockResourceRequirements getResourceRequirements() const { static const int inner_dim = Layout == static_cast(ColMajor) ? 0 : NumDims - 1; @@ -254,23 +254,23 @@ struct TensorEvaluator, Device> const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim; return {inner_dim_shuffled - ? internal::TensorBlockV2ShapeType::kUniformAllDims - : internal::TensorBlockV2ShapeType::kSkewedInnerDims, + ? 
internal::TensorBlockShapeType::kUniformAllDims + : internal::TensorBlockShapeType::kSkewedInnerDims, target_block_size}; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool root_of_expr_ast = false) const { assert(m_impl.data() != NULL); - typedef internal::TensorBlockIOV2 + typedef internal::TensorBlockIO TensorBlockIO; typedef typename TensorBlockIO::Dst TensorBlockIODst; typedef typename TensorBlockIO::Src TensorBlockIOSrc; - const typename TensorBlockV2::Storage block_storage = - TensorBlockV2::prepareStorage( + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage( desc, scratch, /*allow_strided_storage=*/root_of_expr_ast); typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides); @@ -380,7 +380,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = (PacketType::size > 1), - BlockAccessV2 = TensorEvaluator::RawAccess, + BlockAccess = TensorEvaluator::RawAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, RawAccess = false @@ -414,12 +414,12 @@ struct TensorEvaluator, Device> } } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( - const TensorBlockDesc& desc, const TensorBlockV2& block) { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { eigen_assert(this->m_impl.data() != NULL); - typedef internal::TensorBlockIOV2 + typedef internal::TensorBlockIO TensorBlockIO; typedef typename TensorBlockIO::Dst TensorBlockIODst; typedef typename TensorBlockIO::Src TensorBlockIOSrc; @@ -434,7 +434,7 @@ struct TensorEvaluator, Device> ScalarNoConst* buf = static_cast(mem); typedef internal::TensorBlockAssignment< - ScalarNoConst, NumDims, typename TensorBlockV2::XprType, Index> + ScalarNoConst, NumDims, typename TensorBlock::XprType, Index> TensorBlockAssignment; TensorBlockAssignment::Run( diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 8a7fcac23..d05f37532 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -114,7 +114,7 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -122,7 +122,7 @@ struct TensorEvaluator, Device> }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h index 209d6fb3b..24d22c189 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -97,7 +97,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = 
TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -105,7 +105,7 @@ struct TensorEvaluator, Device> }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index a4c38f118..000ed5b41 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -183,7 +183,7 @@ struct TensorEvaluator, D enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -191,7 +191,7 @@ struct TensorEvaluator, D }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp index b56601ebd..33dc2535a 100644 --- a/unsupported/test/cxx11_tensor_block_access.cpp +++ b/unsupported/test/cxx11_tensor_block_access.cpp @@ -19,7 +19,7 @@ using Eigen::Tensor; using Eigen::Index; using Eigen::RowMajor; using Eigen::ColMajor; -using Eigen::internal::TensorBlockV2ShapeType; +using Eigen::internal::TensorBlockShapeType; template @@ -27,10 +27,10 @@ static const T& choose(int layout, const T& col, const T& row) { return layout == ColMajor ? col : row; } -static TensorBlockV2ShapeType RandomShape() { +static TensorBlockShapeType RandomShape() { return internal::random() - ? TensorBlockV2ShapeType::kUniformAllDims - : TensorBlockV2ShapeType::kSkewedInnerDims; + ? TensorBlockShapeType::kUniformAllDims + : TensorBlockShapeType::kSkewedInnerDims; } template @@ -67,13 +67,13 @@ static void Debug(DSizes dims) { template static void test_block_mapper_sanity() { - typedef internal::TensorBlockV2Mapper<2, Layout> TensorBlockMapper; + typedef internal::TensorBlockMapper<2, Layout> TensorBlockMapper; DSizes tensor_dims(100, 100); // Test uniform blocks. TensorBlockMapper uniform_block_mapper( - tensor_dims, {TensorBlockV2ShapeType::kUniformAllDims, 100}); + tensor_dims, {TensorBlockShapeType::kUniformAllDims, 100}); VERIFY_IS_EQUAL(uniform_block_mapper.blockCount(), 100); VERIFY_IS_EQUAL(uniform_block_mapper.blockTotalSize(), 100); @@ -85,7 +85,7 @@ static void test_block_mapper_sanity() // Test skewed to inner dims blocks. 
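
The test updates in this region exercise the renamed mapper directly. Outside the test harness the same calls look like this; a standalone sketch mirroring the uniform-blocks assertions above, with plain asserts in place of VERIFY_IS_EQUAL.

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <cassert>

    // Sketch mirroring the block-mapper test above: one hundred uniform
    // 10x10 blocks tile a 100x100 tensor when the budget is 100 coefficients.
    void block_mapper_sketch() {
      using namespace Eigen;
      typedef internal::TensorBlockMapper<2, ColMajor> BlockMapper;

      DSizes<Index, 2> tensor_dims(100, 100);
      BlockMapper mapper(
          tensor_dims, {internal::TensorBlockShapeType::kUniformAllDims, 100});

      assert(mapper.blockCount() == 100);
      assert(mapper.blockTotalSize() == 100);

      internal::TensorBlockDescriptor<2> block = mapper.blockDescriptor(0);
      assert(block.dimensions().TotalSize() <= 100);
    }
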
TensorBlockMapper skewed_block_mapper( - tensor_dims, {TensorBlockV2ShapeType::kSkewedInnerDims, 100}); + tensor_dims, {TensorBlockShapeType::kSkewedInnerDims, 100}); VERIFY_IS_EQUAL(skewed_block_mapper.blockCount(), 100); VERIFY_IS_EQUAL(skewed_block_mapper.blockTotalSize(), 100); @@ -121,7 +121,7 @@ static void UpdateCoeffSet( template static void test_block_mapper_maps_every_element() { - typedef internal::TensorBlockV2Mapper TensorBlockMapper; + typedef internal::TensorBlockMapper TensorBlockMapper; DSizes dims = RandomDims(); DSizes strides = internal::strides(dims); @@ -227,14 +227,14 @@ template static void test_uniform_block_shape() { typedef internal::TensorBlockDescriptor<5> TensorBlock; - typedef internal::TensorBlockV2Mapper<5, Layout> TensorBlockMapper; + typedef internal::TensorBlockMapper<5, Layout> TensorBlockMapper; { // Test shape 'UniformAllDims' with uniform 'max_coeff count'. DSizes dims(11, 5, 6, 17, 7); const Index max_coeff_count = 5 * 5 * 5 * 5 * 5; TensorBlockMapper - block_mapper(dims, {TensorBlockV2ShapeType::kUniformAllDims, + block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, max_coeff_count}); TensorBlock block = block_mapper.blockDescriptor(0); for (int i = 0; i < 5; ++i) { @@ -249,7 +249,7 @@ static void test_uniform_block_shape() DSizes dims(11, 5, 6, 17, 7); const Index max_coeff_count = 7 * 5 * 5 * 5 * 5; TensorBlockMapper - block_mapper(dims, {TensorBlockV2ShapeType::kUniformAllDims, + block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, max_coeff_count}); TensorBlock block = block_mapper.blockDescriptor(0); VERIFY_IS_EQUAL(7, block.dimensions()[0]); @@ -261,7 +261,7 @@ static void test_uniform_block_shape() DSizes dims(11, 5, 6, 17, 7); const Index max_coeff_count = 5 * 5 * 5 * 5 * 6; TensorBlockMapper - block_mapper(dims, {TensorBlockV2ShapeType::kUniformAllDims, + block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, max_coeff_count}); TensorBlock block = block_mapper.blockDescriptor(0); VERIFY_IS_EQUAL(6, block.dimensions()[4]); @@ -277,7 +277,7 @@ static void test_uniform_block_shape() DSizes dims(11, 5, 6, 17, 7); const Index max_coeff_count = 11 * 5 * 5 * 5 * 5; TensorBlockMapper - block_mapper(dims, {TensorBlockV2ShapeType::kUniformAllDims, + block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, max_coeff_count}); TensorBlock block = block_mapper.blockDescriptor(0); VERIFY_IS_EQUAL(11, block.dimensions()[0]); @@ -289,7 +289,7 @@ static void test_uniform_block_shape() DSizes dims(11, 5, 6, 17, 7); const Index max_coeff_count = 5 * 5 * 5 * 5 * 7; TensorBlockMapper - block_mapper(dims, {TensorBlockV2ShapeType::kUniformAllDims, + block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, max_coeff_count}); TensorBlock block = block_mapper.blockDescriptor(0); VERIFY_IS_EQUAL(7, block.dimensions()[4]); @@ -305,7 +305,7 @@ static void test_uniform_block_shape() DSizes dims(7, 5, 6, 17, 7); const Index max_coeff_count = 7 * 5 * 6 * 7 * 5; TensorBlockMapper - block_mapper(dims, {TensorBlockV2ShapeType::kUniformAllDims, + block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, max_coeff_count}); TensorBlock block = block_mapper.blockDescriptor(0); VERIFY_IS_EQUAL(7, block.dimensions()[0]); @@ -318,7 +318,7 @@ static void test_uniform_block_shape() DSizes dims(7, 5, 6, 9, 7); const Index max_coeff_count = 5 * 5 * 5 * 6 * 7; TensorBlockMapper - block_mapper(dims, {TensorBlockV2ShapeType::kUniformAllDims, + block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, max_coeff_count}); TensorBlock block = 
     VERIFY_IS_EQUAL(7, block.dimensions()[4]);
@@ -334,7 +334,7 @@ static void test_uniform_block_shape()
     DSizes<Index, 5> dims(7, 5, 6, 17, 7);
     const Index max_coeff_count = 7 * 5 * 6 * 17 * 7;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kUniformAllDims,
+        block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[0]);
@@ -347,7 +347,7 @@ static void test_uniform_block_shape()
     DSizes<Index, 5> dims(7, 5, 6, 9, 7);
     const Index max_coeff_count = 7 * 5 * 6 * 9 * 7;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kUniformAllDims,
+        block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[4]);
@@ -363,14 +363,14 @@ template <int Layout>
 static void test_skewed_inner_dim_block_shape()
 {
   typedef internal::TensorBlockDescriptor<5> TensorBlock;
-  typedef internal::TensorBlockV2Mapper<5, Layout> TensorBlockMapper;
+  typedef internal::TensorBlockMapper<5, Layout> TensorBlockMapper;

   // Test shape 'SkewedInnerDims' with partial allocation to inner-most dim.
   if (Layout == ColMajor) {
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 10 * 1 * 1 * 1 * 1;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kSkewedInnerDims,
+        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(10, block.dimensions()[0]);
@@ -382,7 +382,7 @@ static void test_skewed_inner_dim_block_shape()
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 1 * 1 * 1 * 1 * 6;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kSkewedInnerDims,
+        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(6, block.dimensions()[4]);
@@ -397,7 +397,7 @@ static void test_skewed_inner_dim_block_shape()
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 11 * 1 * 1 * 1 * 1;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kSkewedInnerDims,
+        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(11, block.dimensions()[0]);
@@ -409,7 +409,7 @@ static void test_skewed_inner_dim_block_shape()
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 1 * 1 * 1 * 1 * 7;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kSkewedInnerDims,
+        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[4]);
@@ -425,7 +425,7 @@ static void test_skewed_inner_dim_block_shape()
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 11 * 3 * 1 * 1 * 1;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kSkewedInnerDims,
+        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(11, block.dimensions()[0]);
@@ -438,7 +438,7 @@ static void test_skewed_inner_dim_block_shape()
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 1 * 1 * 1 * 15 * 7;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kSkewedInnerDims,
+        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[4]);
@@ -455,7 +455,7 @@ static void test_skewed_inner_dim_block_shape()
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 11 * 5 * 5 * 1 * 1;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kSkewedInnerDims,
+        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(11, block.dimensions()[0]);
@@ -469,7 +469,7 @@ static void test_skewed_inner_dim_block_shape()
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 1 * 1 * 5 * 17 * 7;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kSkewedInnerDims,
+        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[4]);
@@ -486,7 +486,7 @@ static void test_skewed_inner_dim_block_shape()
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 11 * 5 * 6 * 17 * 7;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kSkewedInnerDims,
+        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(11, block.dimensions()[0]);
@@ -499,7 +499,7 @@ static void test_skewed_inner_dim_block_shape()
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 11 * 5 * 6 * 17 * 7;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kSkewedInnerDims,
+        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[4]);
@@ -512,7 +512,7 @@ static void test_skewed_inner_dim_block_shape()
 }

 template <int Layout>
-static void test_empty_dims(const internal::TensorBlockV2ShapeType block_shape)
+static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
 {
   // Test blocking of tensors with zero dimensions:
   //  - we must not crash on asserts and divisions by zero
@@ -520,7 +520,7 @@ static void test_empty_dims(const internal::TensorBlockV2ShapeType block_shape)
   //    (recipe for overflows/underflows, divisions by zero and NaNs later)
   //  - total block count must be zero
   {
-    typedef internal::TensorBlockV2Mapper<1, Layout> TensorBlockMapper;
+    typedef internal::TensorBlockMapper<1, Layout> TensorBlockMapper;

     DSizes<Index, 1> dims(0);
     for (size_t max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
@@ -531,7 +531,7 @@ static void test_empty_dims(const internal::TensorBlockV2ShapeType block_shape)
   }

   {
-    typedef internal::TensorBlockV2Mapper<2, Layout> TensorBlockMapper;
+    typedef internal::TensorBlockMapper<2, Layout> TensorBlockMapper;

     for (int dim1 = 0; dim1 < 3; ++dim1) {
       for (int dim2 = 0; dim2 < 3; ++dim2) {
@@ -573,8 +573,8 @@ EIGEN_DECLARE_TEST(cxx11_tensor_block_access) {
   TEST_LAYOUTS_AND_DIMS(float, test_block_mapper_maps_every_element);
   TEST_LAYOUTS(test_uniform_block_shape);
   TEST_LAYOUTS(test_skewed_inner_dim_block_shape);
-  TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockV2ShapeType::kUniformAllDims);
-  TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockV2ShapeType::kSkewedInnerDims);
+  TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kUniformAllDims);
+  TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kSkewedInnerDims);
 }

 #undef TEST_LAYOUTS
diff --git a/unsupported/test/cxx11_tensor_block_eval.cpp b/unsupported/test/cxx11_tensor_block_eval.cpp
index 700e84a19..4a785dcdc 100644
--- a/unsupported/test/cxx11_tensor_block_eval.cpp
+++ b/unsupported/test/cxx11_tensor_block_eval.cpp
@@ -61,9 +61,9 @@ static TensorBlockParams<NumDims> RandomBlock(DSizes<Index, NumDims> dims,

 template <int Layout, int NumDims>
 static TensorBlockParams<NumDims> SkewedInnerBlock(
     DSizes<Index, NumDims> dims) {
-  using BlockMapper = internal::TensorBlockV2Mapper<NumDims, Layout>;
+  using BlockMapper = internal::TensorBlockMapper<NumDims, Layout>;
   BlockMapper block_mapper(dims,
-                           {internal::TensorBlockV2ShapeType::kSkewedInnerDims,
+                           {internal::TensorBlockShapeType::kSkewedInnerDims,
                             internal::random<size_t>(1, dims.TotalSize())});

   Index total_blocks = block_mapper.blockCount();
@@ -158,7 +158,7 @@ static void VerifyBlockEvaluator(Expression expr, GenBlockParams gen_block) {
   }

   const bool root_of_expr = internal::random<bool>();
-  auto tensor_block = eval.blockV2(block_params.desc, scratch, root_of_expr);
+  auto tensor_block = eval.block(block_params.desc, scratch, root_of_expr);

   if (tensor_block.kind() == internal::TensorBlockKind::kMaterializedInOutput) {
     // Copy data from destination buffer.
@@ -596,7 +596,7 @@ static void VerifyBlockAssignment(Tensor<T, NumDims, Layout>& tensor,
   tensor.setZero();

   // Use evaluator to write block into a tensor.
-  eval.writeBlockV2(block_params.desc, blk);
+  eval.writeBlock(block_params.desc, blk);

   // Make a copy of the result after assignment.
   Tensor<T, NumDims, Layout> block_assigned = tensor;
diff --git a/unsupported/test/cxx11_tensor_block_io.cpp b/unsupported/test/cxx11_tensor_block_io.cpp
index 6f318d9fe..25584433e 100644
--- a/unsupported/test/cxx11_tensor_block_io.cpp
+++ b/unsupported/test/cxx11_tensor_block_io.cpp
@@ -22,10 +22,10 @@ static DSizes<Index, NumDims> RandomDims(Index min, Index max) {
   return DSizes<Index, NumDims>(dims);
 }

-static internal::TensorBlockV2ShapeType RandomBlockShape() {
+static internal::TensorBlockShapeType RandomBlockShape() {
   return internal::random<bool>()
-             ? internal::TensorBlockV2ShapeType::kUniformAllDims
-             : internal::TensorBlockV2ShapeType::kSkewedInnerDims;
+             ? internal::TensorBlockShapeType::kUniformAllDims
+             : internal::TensorBlockShapeType::kSkewedInnerDims;
 }

 template <int NumDims>
@@ -60,7 +60,7 @@ static Index GetInputIndex(Index output_index,

 template <typename T, int NumDims, int Layout>
 static void test_block_io_copy_data_from_source_to_target() {
-  using TensorBlockIO = internal::TensorBlockIOV2<T, Index, NumDims, Layout>;
+  using TensorBlockIO = internal::TensorBlockIO<T, Index, NumDims, Layout>;
   using IODst = typename TensorBlockIO::Dst;
   using IOSrc = typename TensorBlockIO::Src;
@@ -74,7 +74,7 @@ static void test_block_io_copy_data_from_source_to_target() {

   // Construct a tensor block mapper.
   using TensorBlockMapper =
-      internal::TensorBlockV2Mapper<NumDims, Layout>;
+      internal::TensorBlockMapper<NumDims, Layout>;
   TensorBlockMapper block_mapper(
       dims, {RandomBlockShape(), RandomTargetBlockSize(dims)});
@@ -145,7 +145,7 @@ static void test_block_io_copy_using_reordered_dimensions() {
   // Construct a tensor block mapper.
   // NOTE: Tensor block mapper works with shuffled dimensions.
   using TensorBlockMapper =
-      internal::TensorBlockV2Mapper<NumDims, Layout>;
+      internal::TensorBlockMapper<NumDims, Layout>;
   TensorBlockMapper block_mapper(output_tensor_dims,
                                  {RandomBlockShape(),
                                   RandomTargetBlockSize(output_tensor_dims)});
@@ -169,7 +169,7 @@ static void test_block_io_copy_using_reordered_dimensions() {

   // NOTE: Block dimensions are in the same order as output dimensions.
-  using TensorBlockIO = internal::TensorBlockIOV2;
+  using TensorBlockIO = internal::TensorBlockIO;
   using IODst = typename TensorBlockIO::Dst;
   using IOSrc = typename TensorBlockIO::Src;
@@ -181,7 +181,7 @@ static void test_block_io_copy_using_reordered_dimensions() {
   IODst dst(blk_dims, blk_strides, block_data, 0);
   IOSrc src(input_strides, input_data, first_coeff_index);

-  // TODO(ezhulenev): Remove when fully switched to TensorBlockV2.
+  // TODO(ezhulenev): Remove when fully switched to TensorBlock.
   DSizes<int, NumDims> dim_map;
   for (int j = 0; j < NumDims; ++j)
     dim_map[j] = static_cast<int>(output_to_input_dim_map[j]);
@@ -199,7 +199,7 @@ static void test_block_io_copy_using_reordered_dimensions() {
   IODst dst(dst_dims, input_strides, output_data, first_coeff_index);
   IOSrc src(blk_strides, block_data, 0);

-  // TODO(ezhulenev): Remove when fully switched to TensorBlockV2.
+  // TODO(ezhulenev): Remove when fully switched to TensorBlock.
   DSizes<int, NumDims> dim_map;
   for (int j = 0; j < NumDims; ++j)
     dim_map[j] = static_cast<int>(input_to_output_dim_map[j]);
@@ -235,7 +235,7 @@ static void test_block_io_copy_using_reordered_dimensions_do_not_squeeze() {
   float* tensor_data = tensor.data();
   float* block_data = block.data();

-  using TensorBlockIO = internal::TensorBlockIOV2;
+  using TensorBlockIO = internal::TensorBlockIO;
   using IODst = typename TensorBlockIO::Dst;
   using IOSrc = typename TensorBlockIO::Src;
@@ -283,7 +283,7 @@ static void test_block_io_copy_using_reordered_dimensions_squeeze() {
   float* tensor_data = tensor.data();
   float* block_data = block.data();

-  using TensorBlockIO = internal::TensorBlockIOV2;
+  using TensorBlockIO = internal::TensorBlockIO;
   using IODst = typename TensorBlockIO::Dst;
   using IOSrc = typename TensorBlockIO::Src;
@@ -334,7 +334,7 @@ static void test_block_io_zero_stride() {
   Tensor output(output_tensor_dims);
   output.setRandom();

-  using TensorBlockIO = internal::TensorBlockIOV2;
+  using TensorBlockIO = internal::TensorBlockIO;
   using IODst = typename TensorBlockIO::Dst;
   using IOSrc = typename TensorBlockIO::Src;
@@ -360,7 +360,7 @@ template <int Layout>
 static void test_block_io_squeeze_ones() {
-  using TensorBlockIO = internal::TensorBlockIOV2;
+  using TensorBlockIO = internal::TensorBlockIO;
   using IODst = typename TensorBlockIO::Dst;
   using IOSrc = typename TensorBlockIO::Src;
-- 
cgit v1.2.3