From 1c879eb010df8e53e5ac016ee5d155db2c721c2b Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 10 Dec 2019 15:40:23 -0800 Subject: Remove V2 suffix from TensorBlock --- unsupported/Eigen/CXX11/Tensor | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 8 +- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 16 +- unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 1481 ++++++++++++++++++++ unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h | 1481 -------------------- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 50 +- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 34 +- .../Eigen/CXX11/src/Tensor/TensorConcatenation.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorConversion.h | 14 +- .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 8 +- unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 10 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 110 +- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 16 +- unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 14 +- .../CXX11/src/Tensor/TensorForwardDeclarations.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 16 +- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorInflation.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 58 +- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 20 +- unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 4 +- unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 12 +- unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 22 +- unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 32 +- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 4 +- unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 4 +- unsupported/test/cxx11_tensor_block_access.cpp | 68 +- unsupported/test/cxx11_tensor_block_eval.cpp | 8 +- unsupported/test/cxx11_tensor_block_io.cpp | 26 +- 38 files changed, 1793 insertions(+), 1793 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 10786048e..2640f9565 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -97,7 +97,7 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorGlobalFunctions.h" #include "src/Tensor/TensorBase.h" -#include "src/Tensor/TensorBlockV2.h" +#include "src/Tensor/TensorBlock.h" #include "src/Tensor/TensorEvaluator.h" #include "src/Tensor/TensorExpr.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index 68bfd141a..91a6f8d6c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -88,7 +88,7 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -96,7 
+96,7 @@ struct TensorEvaluator, Device> }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -229,7 +229,7 @@ struct TensorEvaluator, Devi enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator >, Device>::Layout, CoordAccess = false, // to be implemented @@ -237,7 +237,7 @@ struct TensorEvaluator, Devi }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 22d672aa4..72f072cf2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -108,8 +108,8 @@ struct TensorEvaluator, Device> TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - BlockAccessV2 = TensorEvaluator::BlockAccessV2 & - TensorEvaluator::BlockAccessV2, + BlockAccess = TensorEvaluator::BlockAccess & + TensorEvaluator::BlockAccess, PreferBlockAccess = TensorEvaluator::PreferBlockAccess | TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, @@ -120,7 +120,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - typedef typename TensorEvaluator::TensorBlockV2 + typedef typename TensorEvaluator::TensorBlock RightTensorBlock; //===--------------------------------------------------------------------===// @@ -201,13 +201,13 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { - return internal::TensorBlockV2ResourceRequirements::merge( + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::merge( m_leftImpl.getResourceRequirements(), m_rightImpl.getResourceRequirements()); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2( + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock( TensorBlockDesc& desc, TensorBlockScratch& scratch) { if (TensorEvaluator::RawAccess && m_leftImpl.data() != NULL) { @@ -218,10 +218,10 @@ struct TensorEvaluator, Device> /*dst_strides=*/internal::strides(m_leftImpl.dimensions())); } - RightTensorBlock block = m_rightImpl.blockV2(desc, scratch, /*root_of_expr_ast=*/true); + RightTensorBlock block = m_rightImpl.block(desc, scratch, /*root_of_expr_ast=*/true); // If block was evaluated into a destination, there is no need to do assignment. 
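      // (A block of kind kMaterializedInOutput was evaluated directly into the
      // destination buffer attached to `desc` above, i.e. into the left-hand
      // side's own storage, so writing it again would be redundant.)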
if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { - m_leftImpl.writeBlockV2(desc, block); + m_leftImpl.writeBlock(desc, block); } block.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h new file mode 100644 index 000000000..222333847 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -0,0 +1,1481 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H +#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H + +namespace Eigen { +namespace internal { + +// -------------------------------------------------------------------------- // +// Forward declarations for templates defined below. +template +class TensorBlockIO; + +// -------------------------------------------------------------------------- // +// Helper function to compute strides for densely stored buffer of given +// dimensions. + +// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use +// this function instead everywhere. +template +EIGEN_ALWAYS_INLINE DSizes strides( + const DSizes& dimensions) { + DSizes strides; + if (NumDims == 0) return strides; + + // TODO(ezhulenev): Use templates to unroll this loop (similar to + // h_array_reduce in CXX11meta.h)? Benchmark it. + if (static_cast(Layout) == static_cast(ColMajor)) { + strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + strides[i] = strides[i - 1] * dimensions[i - 1]; + } + } else { + strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dimensions[i + 1]; + } + } + + return strides; +} + +template +EIGEN_ALWAYS_INLINE DSizes strides( + const Eigen::array& dimensions) { + return strides(DSizes(dimensions)); +} + +template +EIGEN_STRONG_INLINE DSizes strides( + const Sizes& sizes) { + return strides(DSizes(sizes)); +} + +// -------------------------------------------------------------------------- // + +// Tensor block shape type defines what are the shape preference for the blocks +// extracted from the larger tensor. +// +// Example: blocks of 100 elements from the large 100x100 tensor: +// - tensor: 100x100 +// - target_block_size: 100 +// +// TensorBlockShapeType: +// - kUniformAllDims: 100 blocks of size 10x10 +// - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column +// or row major layout) +enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims }; + +struct TensorBlockResourceRequirements { + TensorBlockShapeType shape_type; + size_t size; + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE TensorBlockResourceRequirements + merge(const TensorBlockResourceRequirements &lhs, + const TensorBlockResourceRequirements &rhs) { + return {merge(lhs.shape_type, rhs.shape_type), merge(rhs.size, lhs.size)}; + } + + // This is a resource requirement that should be returned from expressions + // that do not have any block evaluation preference (e.g. default tensor + // expression with raw buffer access). 
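+  // For example, `any()` below returns {kUniformAllDims, 1}; merged with a
+  // real preference such as {kSkewedInnerDims, 1024} it yields
+  // {kSkewedInnerDims, 1024} (a skewed shape wins, and sizes are combined
+  // with a max), so an expression with no preference never constrains its
+  // neighbors.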
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
+    return {TensorBlockShapeType::kUniformAllDims, 1};
+  }
+
+ private:
+  using Requirements = TensorBlockResourceRequirements;
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
+    return numext::maxi(lhs_size, rhs_size);
+  }
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE TensorBlockShapeType merge(
+      TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
+    return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
+            rhs == TensorBlockShapeType::kSkewedInnerDims)
+               ? TensorBlockShapeType::kSkewedInnerDims
+               : TensorBlockShapeType::kUniformAllDims;
+  }
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockDescriptor specifies a block offset within a tensor and the block
+// sizes along each of the tensor dimensions.
+
+template <int NumDims, typename IndexType = Eigen::Index>
+class TensorBlockDescriptor {
+ public:
+  typedef DSizes<IndexType, NumDims> Dimensions;
+
+  // If we evaluate a Tensor assignment, and the expression on the left already
+  // has a memory buffer, then we might do a performance optimization and
+  // evaluate the root expression directly into the final output memory.
+  // Sometimes it's also possible to reuse that buffer for materializing
+  // subexpressions inside an expression tree, to avoid dynamic memory
+  // allocation.
+  //
+  // The pointer type of the underlying storage is erased, because passing the
+  // Scalar type through all the expression evaluation layers is way too many
+  // templates. In practice the destination buffer type should always match the
+  // evaluated expression scalar type.
+  class DestinationBuffer {
+   public:
+    enum DestinationBufferKind : int {
+      // The above explicit specification of "int" as the enum basetype is
+      // needed to get around a HIPCC link error ("the field type is not
+      // amp-compatible") which is issued for class members with the enum type.
+      // TODO(rocm):
+      // remove the "int" basetype once HIPCC has been fixed to not error out
+      // in the above scenario.
+
+      // Destination buffer is not defined (`m_data` == nullptr).
+      kEmpty,
+
+      // The tensor block defined by an owning tensor block descriptor fits
+      // contiguously into the destination buffer. In this case it's safe to
+      // materialize the tensor block in the destination buffer, wrap it in a
+      // TensorMap, and use it to build an Eigen expression on top of it.
+      kContiguous,
+
+      // Destination buffer strides do not match the strides of the
+      // contiguously stored block, and it's impossible to define a TensorMap
+      // over this buffer. However, if we are evaluating the root of an
+      // expression tree, we can still materialize an output into this
+      // destination, because we can guarantee that no one will ever access it
+      // through the block API.
+      //
+      // In theory it is possible to build a valid TensorStriding<TensorMap>
+      // expression on top of this destination buffer, however it has
+      // inefficient coeff/packet access, and defeats the purpose of the fast
+      // block evaluation API.
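+      //
+      // For instance (hypothetical sizes): a [2, 4] column-major block stored
+      // contiguously has strides [1, 2]; a destination that is a column slice
+      // of a larger [3, 4] buffer has strides [1, 3]. Such a buffer can only
+      // absorb a root-level output, so its kind is kStrided.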
+ kStrided + }; + + template + Scalar* data() const { + eigen_assert(m_data_type_size == sizeof(Scalar)); + return static_cast(m_data); + } + + const Dimensions& strides() const { return m_strides; } + const DestinationBufferKind& kind() const { return m_kind; } + + private: + friend class TensorBlockDescriptor; + + DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {} + + template + DestinationBuffer(Scalar* data, const Dimensions& strides, + DestinationBufferKind kind) + : m_data(static_cast(data)), + m_data_type_size(sizeof(Scalar)), + m_strides(strides), + m_kind(kind) {} + + template + static DestinationBuffer make(const TensorBlockDescriptor& desc, + Scalar* data, const Dimensions& strides) { + return DestinationBuffer(data, strides, kind(desc, strides)); + } + + template + static DestinationBufferKind kind(const TensorBlockDescriptor& desc, + const Dimensions& strides) { + const Dimensions& desc_dims = desc.dimensions(); + const Dimensions& desc_strides = internal::strides(desc_dims); + for (int i = 0; i < NumDims; ++i) { + if (desc_dims[i] == 1) continue; + if (desc_strides[i] != strides[i]) return kStrided; + } + return kContiguous; + } + + // Storage pointer is type erased, to reduce template bloat, but we still + // keep the size of the underlying element type for error checking. + void* m_data; + size_t m_data_type_size; + + // Destination buffer dimensions always match the dimensions of a tensor + // block descriptor it belongs to, however strides might be different. + Dimensions m_strides; + + DestinationBufferKind m_kind; + }; + + TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions, + const DestinationBuffer& destination) + : m_offset(offset), + m_dimensions(dimensions), + m_destination(destination) {} + + TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions) + : m_offset(offset), + m_dimensions(dimensions), + m_destination(DestinationBuffer()) {} + + IndexType offset() const { return m_offset; } + const Dimensions& dimensions() const { return m_dimensions; } + IndexType dimension(int index) const { return m_dimensions[index]; } + IndexType size() const { return array_prod(m_dimensions); } + + const DestinationBuffer& destination() const { return m_destination; } + + template + void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) { + eigen_assert(dst_base != NULL); + m_destination = + DestinationBuffer::template make(*this, dst_base, dst_strides); + } + + template + void AddDestinationBuffer( + Scalar* dst_base, + const DSizes& dst_strides) { + // DSizes constructor will do index type promotion if it's safe. + AddDestinationBuffer(dst_base, Dimensions(dst_strides)); + } + + TensorBlockDescriptor& DropDestinationBuffer() { + m_destination.m_data = NULL; + m_destination.m_kind = DestinationBuffer::kEmpty; + return *this; + } + + bool HasDestinationBuffer() const { + return m_destination.kind() != DestinationBuffer::kEmpty; + } + + // Returns a copy of `*this` with updated offset. + TensorBlockDescriptor WithOffset(IndexType offset) const { + return TensorBlockDescriptor(offset, m_dimensions, m_destination); + } + + private: + // Offset and dimensions are immutable after construction. Block descriptor + // can only be mutated by adding or dropping destination. 
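+  //
+  // A typical lifecycle sketch (names and sizes hypothetical):
+  //
+  //   TensorBlockDescriptor<2> desc(/*offset=*/0, /*dimensions=*/{16, 16});
+  //   desc.AddDestinationBuffer(dst_ptr, internal::strides<ColMajor>(dims));
+  //   ... evaluator materializes the block into the destination ...
+  //   desc.DropDestinationBuffer();  // the buffer must not be claimed twice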
+ const IndexType m_offset; + const Dimensions m_dimensions; + DestinationBuffer m_destination; +}; + +// -------------------------------------------------------------------------- // +// TensorBlockMapper is responsible for iterating over the blocks of a tensor. + +template +class TensorBlockMapper { + typedef TensorBlockDescriptor BlockDescriptor; + + public: + typedef DSizes Dimensions; + + TensorBlockMapper() = default; + TensorBlockMapper(const DSizes& dimensions, + const TensorBlockResourceRequirements& requirements) + : m_tensor_dimensions(dimensions), m_requirements(requirements) { + // Initialize `m_block_dimensions`. + InitializeBlockDimensions(); + + // Calculate block counts by dimension and total block count. + DSizes block_count; + for (int i = 0; i < NumDims; ++i) { + block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]); + } + m_total_block_count = array_prod(block_count); + + // Calculate block strides (used for enumerating blocks). + m_tensor_strides = strides(m_tensor_dimensions); + m_block_strides = strides(block_count); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const { + return m_total_block_count; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const { + return m_block_dimensions.TotalSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes& + blockDimensions() const { + return m_block_dimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + BlockDescriptor blockDescriptor(IndexType block_index) const { + static const bool isColMajor = Layout == static_cast(ColMajor); + + IndexType offset = 0; + DSizes dimensions; + + if (NumDims == 0) return BlockDescriptor(offset, dimensions); + + // Iterate outer -> inner dimensions. + for (int i = NumDims - 1; i >= 0; --i) { + const int dim = isColMajor ? i : NumDims - i - 1; + + const IndexType idx = block_index / m_block_strides[dim]; + block_index -= idx * m_block_strides[dim]; + + const IndexType coord = idx * m_block_dimensions[dim]; + dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord, + m_block_dimensions[dim]); + offset += coord * m_tensor_strides[dim]; + } + + return {offset, dimensions}; + } + + private: + void InitializeBlockDimensions() { + // Requested block shape and size. + const TensorBlockShapeType shape_type = m_requirements.shape_type; + const IndexType target_block_size = + numext::maxi(1, static_cast(m_requirements.size)); + + // Corner case: one of the dimensions is zero. Logic below is too complex + // to handle this case on a general basis, just use unit block size. + // Note: we must not yield blocks with zero dimensions (recipe for + // overflows/underflows, divisions by zero and NaNs later). + if (m_tensor_dimensions.TotalSize() == 0) { + for (int i = 0; i < NumDims; ++i) { + m_block_dimensions[i] = 1; + } + return; + } + + // If tensor fits into a target block size, evaluate it as a single block. + if (m_tensor_dimensions.TotalSize() <= target_block_size) { + m_block_dimensions = m_tensor_dimensions; + return; + } + + static const bool isColMajor = Layout == static_cast(ColMajor); + + // Block shape skewed towards inner dimension. + if (shape_type == TensorBlockShapeType::kSkewedInnerDims) { + IndexType coeff_to_allocate = target_block_size; + + for (int i = 0; i < NumDims; ++i) { + const int dim = isColMajor ? 
i : NumDims - i - 1;
+        m_block_dimensions[dim] =
+            numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
+        coeff_to_allocate = divup(
+            coeff_to_allocate,
+            numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
+      }
+      eigen_assert(coeff_to_allocate == 1);
+
+    } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
+      // Tensor will not fit within 'target_block_size' budget: calculate
+      // tensor block dimension sizes based on a "square" dimension size
+      // target.
+      const IndexType dim_size_target = convert_index<IndexType>(
+          std::pow(static_cast<float>(target_block_size),
+                   1.0f / static_cast<float>(m_block_dimensions.rank())));
+
+      for (int i = 0; i < NumDims; ++i) {
+        // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
+        // a multiple of the packet size. Note that reducing
+        // 'block_dim_size' in this manner can increase the number of
+        // blocks, and so will amplify any per-block overhead.
+        m_block_dimensions[i] =
+            numext::mini(dim_size_target, m_tensor_dimensions[i]);
+      }
+
+      // Add any un-allocated coefficients to inner dimension(s).
+      IndexType total_size = m_block_dimensions.TotalSize();
+      for (int i = 0; i < NumDims; ++i) {
+        const int dim = isColMajor ? i : NumDims - i - 1;
+
+        if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
+          const IndexType total_size_other_dims =
+              total_size / m_block_dimensions[dim];
+          const IndexType alloc_avail =
+              divup(target_block_size, total_size_other_dims);
+          if (alloc_avail == m_block_dimensions[dim]) {
+            // Insufficient excess coefficients to allocate.
+            break;
+          }
+          m_block_dimensions[dim] =
+              numext::mini(m_tensor_dimensions[dim], alloc_avail);
+          total_size = total_size_other_dims * m_block_dimensions[dim];
+        }
+      }
+
+    } else {
+      eigen_assert(false);  // unknown block shape
+    }
+
+    eigen_assert(m_block_dimensions.TotalSize() >=
+                 numext::mini(target_block_size,
+                              m_tensor_dimensions.TotalSize()));
+  }
+
+  DSizes<IndexType, NumDims> m_tensor_dimensions;
+  TensorBlockResourceRequirements m_requirements;
+
+  DSizes<IndexType, NumDims> m_block_dimensions;
+  IndexType m_total_block_count;
+
+  DSizes<IndexType, NumDims> m_tensor_strides;
+  DSizes<IndexType, NumDims> m_block_strides;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockScratchAllocator is responsible for allocating temporary buffers
+// for block evaluation (output or input block materialization). Given that
+// Eigen expression traversal order is deterministic, all temporary allocations
+// happen in the same order, and usually have exactly the same size. The
+// scratch allocator keeps a trace of all dynamic allocations, and after the
+// first block evaluation is completed, we should be able to reuse all the
+// temporary buffers for the next block evaluation.
+
+template <typename Device>
+class TensorBlockScratchAllocator {
+ public:
+  explicit TensorBlockScratchAllocator(const Device& device)
+      : m_device(device), m_allocation_index(0) {}
+
+  ~TensorBlockScratchAllocator() {
+    for (size_t i = 0; i < m_allocations.size(); ++i) {
+      m_device.deallocate(m_allocations[i].ptr);
+    }
+  }
+
+  void* allocate(size_t size) {
+    // TODO(ezhulenev): Remove when replaced with inlined vector.
+    if (m_allocations.capacity() == 0) m_allocations.reserve(8);
+
+    // Check if we already have an existing allocation at the current index.
+    const int num_allocations = static_cast<int>(m_allocations.size());
+    const bool has_allocation = m_allocation_index < num_allocations;
+
+    // Allocation index can't be larger than the number of allocations.
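+    // (For example, if block #0 allocated 512 and then 128 bytes, block #1
+    // served after a reset() gets the same two pointers in the same order,
+    // and an allocation only grows when a larger size is requested.)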
+    eigen_assert(m_allocation_index <= num_allocations);
+
+    // If we have an existing allocation, and its size is larger than or equal
+    // to the requested size, we do nothing.
+
+    // If the current allocation can't fit the requested size, we deallocate
+    // it, and replace it with a larger allocation.
+    if (has_allocation && m_allocations[m_allocation_index].size < size) {
+      m_device.deallocate(m_allocations[m_allocation_index].ptr);
+      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
+      m_allocations[m_allocation_index].size = size;
+    }
+
+    // Make a new allocation if we don't have an existing one.
+    if (!has_allocation) {
+      Allocation allocation;
+      allocation.ptr = m_device.allocate(size);
+      allocation.size = size;
+      m_allocations.push_back(allocation);
+    }
+
+    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
+    eigen_assert(m_allocations[m_allocation_index].size >= size);
+
+    return m_allocations[m_allocation_index++].ptr;
+  }
+
+  void reset() { m_allocation_index = 0; }
+
+ private:
+  struct Allocation {
+    void* ptr;
+    size_t size;
+  };
+
+  const Device& m_device;
+  int m_allocation_index;
+  // TODO(ezhulenev): This should be an inlined vector.
+  std::vector<Allocation> m_allocations;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockKind represents all possible block kinds that can be produced by
+// the TensorEvaluator::evalBlock function.
+enum TensorBlockKind {
+  // Tensor block that is a lazy expression that must be assigned to a
+  // destination using TensorBlockAssign.
+  kExpr,
+
+  // Tensor block that is a view into a memory buffer owned by an underlying
+  // Tensor expression (e.g. it can be a view into a Tensor buffer).
+  kView,
+
+  // Tensor block that was materialized in a scratch memory buffer, allocated
+  // with TensorBlockScratchAllocator. This block must be copied to a
+  // destination, similar to a block of `kExpr` type.
+  kMaterializedInScratch,
+
+  // Tensor block that was materialized directly into the final output memory
+  // buffer. For example, if the left side of an assignment is a Tensor, we can
+  // directly materialize the block in the destination memory.
+  //
+  // If strides in the output buffer do not match tensor block strides, the
+  // Tensor expression will be invalid, and should not be used by
+  // TensorBlockAssign or for constructing another block expression.
+  kMaterializedInOutput
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockNotImplemented should be used to define the TensorBlock typedef
+// in TensorEvaluators that do not support block evaluation.
+
+class TensorBlockNotImplemented {
+ public:
+  typedef void XprType;
+};
+
+// -------------------------------------------------------------------------- //
+// XprScalar extracts the Scalar type from an Eigen expression (if the
+// expression type is not void). It's required to be able to define a lazy
+// block expression for argument types that do not support block evaluation.
+
+template <typename XprType>
+struct XprScalar {
+  typedef typename XprType::Scalar type;
+};
+template <>
+struct XprScalar<void> {
+  typedef void type;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorMaterializedBlock is a fully evaluated block of the original tensor,
+// and XprType is just a TensorMap over the data. This block type is typically
+// used to materialize blocks of tensor expressions that can't be efficiently
+// represented as lazy Tensor expressions with fast coeff/packet operations,
+// e.g.
we materialize all broadcasts into evaluated blocks. +// +// TensorMaterializedBlock does not own its memory buffer, it's either a memory +// buffer that backs the original expression (e.g. block is just a view into a +// Tensor), or a memory buffer allocated with scratch allocator, and in this +// case the scratch allocator will deallocate it at the end of block based +// expression execution. +// +// If the block was evaluated directly into the output buffer, and strides in +// the output buffer do not match block strides, the TensorMap expression will +// be invalid, and should never be used in block assignment or any other tensor +// expression. + +template +class TensorMaterializedBlock { + public: + typedef DSizes Dimensions; + typedef TensorMap > XprType; + + TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, + const Dimensions& dimensions, bool valid_expr = true) + : m_kind(kind), + m_data(data), + m_dimensions(dimensions), + m_expr(m_data, m_dimensions), + m_valid_expr(valid_expr) { + eigen_assert(m_kind == internal::TensorBlockKind::kView || + m_kind == internal::TensorBlockKind::kMaterializedInScratch || + m_kind == internal::TensorBlockKind::kMaterializedInOutput); + } + + TensorBlockKind kind() const { return m_kind; } + // NOTE(ezhulenev): Returning XprType by value like in other block types + // causes asan failures. The theory is that XprType::Nested doesn't work + // properly for TensorMap. + const XprType& expr() const { + eigen_assert(m_valid_expr); + return m_expr; + } + const Scalar* data() const { return m_data; } + void cleanup() {} + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + + // TensorMaterializedBlock can be backed by different types of storage: + // + // (1) Contiguous block of memory allocated with scratch allocator. + // (2) Contiguous block of memory reused from tensor block descriptor + // destination buffer. + // (3) Strided block of memory reused from tensor block descriptor + // destination buffer. + // + class Storage { + public: + Scalar* data() const { return m_data; } + const Dimensions& dimensions() const { return m_dimensions; } + const Dimensions& strides() const { return m_strides; } + + TensorMaterializedBlock AsTensorMaterializedBlock() const { + return TensorMaterializedBlock( + m_materialized_in_output + ? internal::TensorBlockKind::kMaterializedInOutput + : internal::TensorBlockKind::kMaterializedInScratch, + m_data, m_dimensions, !m_strided_storage); + } + + private: + friend class TensorMaterializedBlock; + + Storage(Scalar* data, const Dimensions& dimensions, + const Dimensions& strides, bool materialized_in_output, + bool strided_storage) + : m_data(data), + m_dimensions(dimensions), + m_strides(strides), + m_materialized_in_output(materialized_in_output), + m_strided_storage(strided_storage) {} + + Scalar* m_data; + Dimensions m_dimensions; + Dimensions m_strides; + bool m_materialized_in_output; + bool m_strided_storage; + }; + + // Creates a storage for materialized block either from the block descriptor + // destination buffer, or allocates a new buffer with scratch allocator. + template + EIGEN_STRONG_INLINE static Storage prepareStorage( + TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool allow_strided_storage = false) { + // Try to reuse destination as an output block buffer. 
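+    // (E.g. a kContiguous destination is claimed for the block and dropped
+    // from the descriptor; a kStrided destination is claimed only when the
+    // caller allows strided storage; otherwise we fall back to a scratch
+    // allocation below.)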
+    typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
+
+    if (desc.destination().kind() == DestinationBuffer::kContiguous) {
+      Scalar* buffer = desc.destination().template data<Scalar>();
+      desc.DropDestinationBuffer();
+      return Storage(buffer, desc.dimensions(),
+                     internal::strides<Layout>(desc.dimensions()),
+                     /*materialized_in_output=*/true,
+                     /*strided_storage=*/false);
+
+    } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
+               allow_strided_storage) {
+      Scalar* buffer = desc.destination().template data<Scalar>();
+      desc.DropDestinationBuffer();
+      return Storage(buffer, desc.dimensions(), desc.destination().strides(),
+                     /*materialized_in_output=*/true, /*strided_storage=*/true);
+
+    } else {
+      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
+      return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
+                     internal::strides<Layout>(desc.dimensions()),
+                     /*materialized_in_output=*/false,
+                     /*strided_storage=*/false);
+    }
+  }
+
+  // Creates a materialized block for the given descriptor from a memory
+  // buffer.
+  template <typename DataDimensions, typename TensorBlockDesc,
+            typename TensorBlockScratch>
+  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
+      const Scalar* data, const DataDimensions& data_dims,
+      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+    eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
+
+    // If the tensor block's dimensions cover a contiguous slice of the
+    // underlying memory, we can skip the block buffer memory allocation, and
+    // construct a block from the existing `data` memory buffer.
+    //
+    // Example: (RowMajor layout)
+    //   data_dims:         [11, 12, 13, 14]
+    //   desc.dimensions(): [1,  1,  3,  14]
+    //
+    // In this case we can construct a TensorBlock starting at
+    // `data + desc.offset()`, with `desc.dimensions()` block sizes.
+    static const bool is_col_major = Layout == ColMajor;
+
+    // Find out how many inner dimensions have a matching size.
+    int num_matching_inner_dims = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      int dim = is_col_major ? i : NumDims - i - 1;
+      if (data_dims[dim] != desc.dimensions()[dim]) break;
+      ++num_matching_inner_dims;
+    }
+
+    // All the outer dimensions must be of size `1`, except a single dimension
+    // before the matching inner dimension (`3` in the example above).
+    bool can_use_direct_access = true;
+    for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
+      int dim = is_col_major ? i : NumDims - i - 1;
+      if (desc.dimension(dim) != 1) {
+        can_use_direct_access = false;
+        break;
+      }
+    }
+
+    if (can_use_direct_access) {
+      const Scalar* block_start = data + desc.offset();
+      return TensorMaterializedBlock(internal::TensorBlockKind::kView,
+                                     block_start, desc.dimensions());
+
+    } else {
+      // Reuse destination buffer or allocate new buffer with scratch
+      // allocator.
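+      // (E.g. with RowMajor data_dims [11, 12, 13, 14] and block dimensions
+      // [2, 1, 13, 14] the block covers two slabs of memory separated by
+      // 12 * 13 * 14 elements, so it cannot be a kView and must be copied out
+      // with TensorBlockIO below.)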
+      const Storage storage = prepareStorage(desc, scratch);
+
+      typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
+          TensorBlockIO;
+      typedef typename TensorBlockIO::Dst TensorBlockIODst;
+      typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
+                           data, desc.offset());
+      TensorBlockIODst dst(storage.dimensions(), storage.strides(),
+                           storage.data());
+
+      TensorBlockIO::Copy(dst, src);
+      return storage.AsTensorMaterializedBlock();
+    }
+  }
+
+ private:
+  TensorBlockKind m_kind;
+  const Scalar* m_data;
+  Dimensions m_dimensions;
+  XprType m_expr;
+  bool m_valid_expr;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorCwiseUnaryBlock is a lazy tensor expression block that applies a
+// UnaryOp functor to the blocks produced by the underlying Tensor expression.
+
+template <typename UnaryOp, typename ArgTensorBlock>
+class TensorCwiseUnaryBlock {
+  static const bool NoArgBlockAccess =
+      internal::is_void<typename ArgTensorBlock::XprType>::value;
+
+ public:
+  typedef typename conditional<
+      NoArgBlockAccess, void,
+      TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
+      type XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
+      : m_arg_block(arg_block), m_functor(functor) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+
+  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
+  const Scalar* data() const { return NULL; }
+  void cleanup() { m_arg_block.cleanup(); }
+
+ private:
+  ArgTensorBlock m_arg_block;
+  UnaryOp m_functor;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorCwiseBinaryBlock is a lazy tensor expression block that applies a
+// BinaryOp functor to the blocks produced by the underlying Tensor
+// expressions.
+
+template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
+class TensorCwiseBinaryBlock {
+  static const bool NoArgBlockAccess =
+      internal::is_void<typename LhsTensorBlock::XprType>::value ||
+      internal::is_void<typename RhsTensorBlock::XprType>::value;
+
+ public:
+  typedef typename conditional<
+      NoArgBlockAccess, void,
+      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
+                          const typename RhsTensorBlock::XprType> >::type
+      XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
+                         const RhsTensorBlock& right_block,
+                         const BinaryOp& functor)
+      : m_left_block(left_block),
+        m_right_block(right_block),
+        m_functor(functor) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+
+  XprType expr() const {
+    return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
+  }
+
+  const Scalar* data() const { return NULL; }
+
+  void cleanup() {
+    m_left_block.cleanup();
+    m_right_block.cleanup();
+  }
+
+ private:
+  LhsTensorBlock m_left_block;
+  RhsTensorBlock m_right_block;
+  BinaryOp m_functor;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorUnaryExprBlock is a lazy tensor expression block that can construct
+// an arbitrary tensor expression from a block of the underlying type (this is
+// a generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
+ +template +class TensorUnaryExprBlock { + + typedef typename ArgTensorBlock::XprType ArgXprType; + static const bool NoArgBlockAccess = internal::is_void::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + typename BlockFactory::template XprType::type>::type XprType; + + typedef typename XprScalar::type Scalar; + + TensorUnaryExprBlock(const ArgTensorBlock& arg_block, + const BlockFactory& factory) + : m_arg_block(arg_block), m_factory(factory) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + XprType expr() const { return m_factory.expr(m_arg_block.expr()); } + const Scalar* data() const { return NULL; } + void cleanup() { m_arg_block.cleanup(); } + + private: + ArgTensorBlock m_arg_block; + BlockFactory m_factory; +}; + +// -------------------------------------------------------------------------- // +// TensorTernaryExprBlock is a lazy tensor expression block that can construct +// an arbitrary tensor expression from three blocks of the underlying type. + +template +class TensorTernaryExprBlock { + + typedef typename Arg1TensorBlock::XprType Arg1XprType; + typedef typename Arg2TensorBlock::XprType Arg2XprType; + typedef typename Arg3TensorBlock::XprType Arg3XprType; + + static const bool NoArgBlockAccess = internal::is_void::value || + internal::is_void::value || + internal::is_void::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + typename BlockFactory::template XprType::type>::type XprType; + + typedef typename XprScalar::type Scalar; + + TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, + const Arg2TensorBlock& arg2_block, + const Arg3TensorBlock& arg3_block, + const BlockFactory& factory) + : m_arg1_block(arg1_block), + m_arg2_block(arg2_block), + m_arg3_block(arg3_block), + m_factory(factory) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + XprType expr() const { + return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), + m_arg3_block.expr()); + } + const Scalar* data() const { return NULL; } + void cleanup() { + m_arg1_block.cleanup(); + m_arg2_block.cleanup(); + m_arg3_block.cleanup(); + } + + private: + Arg1TensorBlock m_arg1_block; + Arg2TensorBlock m_arg2_block; + Arg3TensorBlock m_arg3_block; + BlockFactory m_factory; +}; + +// -------------------------------------------------------------------------- // +// StridedLinearBufferCopy provides a method to copy data between two linear +// buffers with different strides, with optimized paths for scatter/gather. + +template +class StridedLinearBufferCopy { + typedef typename packet_traits::type Packet; + enum { + Vectorizable = packet_traits::Vectorizable, + PacketSize = packet_traits::size + }; + + public: + // Specifying linear copy kind statically gives ~30% speedup for small sizes. 
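+  // (E.g. copying a column out of a row-major matrix is a Gather: src_stride
+  // is the row length and dst_stride == 1; broadcasting one scalar into a
+  // contiguous buffer is a FillLinear: src_stride == 0, dst_stride == 1.)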
+ enum Kind { + Linear = 0, // src_stride == 1 && dst_stride == 1 + Scatter = 1, // src_stride == 1 && dst_stride != 1 + FillLinear = 2, // src_stride == 0 && dst_stride == 1 + FillScatter = 3, // src_stride == 0 && dst_stride != 1 + Gather = 4, // dst_stride == 1 + Random = 5 // everything else + }; + + struct Dst { + Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {} + + IndexType offset; + IndexType stride; + Scalar* data; + }; + + struct Src { + Src(IndexType o, IndexType s, const Scalar* d) + : offset(o), stride(s), data(d) {} + + IndexType offset; + IndexType stride; + const Scalar* data; + }; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst, + const Src& src, + const size_t count) { + Run(count, dst.offset, dst.stride, dst.data, src.offset, src.stride, + src.data); + } + + private: + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const IndexType count, const IndexType dst_offset, + const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data, + const IndexType src_offset, const IndexType src_stride, + const Scalar* EIGEN_RESTRICT src_data) { + const Scalar* src = &src_data[src_offset]; + Scalar* dst = &dst_data[dst_offset]; + + if (!Vectorizable) { + for (Index i = 0; i < count; ++i) { + dst[i * dst_stride] = src[i * src_stride]; + } + return; + } + + const IndexType vectorized_size = count - PacketSize; + IndexType i = 0; + + if (kind == Linear) { + // ******************************************************************** // + // Linear copy from `src` to `dst`. + const IndexType unrolled_size = count - 4 * PacketSize; + eigen_assert(src_stride == 1 && dst_stride == 1); + for (; i <= unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + Packet p = ploadu(src + i + j * PacketSize); + pstoreu(dst + i + j * PacketSize, p); + } + } + for (; i <= vectorized_size; i += PacketSize) { + Packet p = ploadu(src + i); + pstoreu(dst + i, p); + } + for (; i < count; ++i) { + dst[i] = src[i]; + } + // ******************************************************************** // + } else if (kind == Scatter) { + // Scatter from `src` to `dst`. + eigen_assert(src_stride == 1 && dst_stride != 1); + for (; i <= vectorized_size; i += PacketSize) { + Packet p = ploadu(src + i); + pscatter(dst + i * dst_stride, p, dst_stride); + } + for (; i < count; ++i) { + dst[i * dst_stride] = src[i]; + } + // ******************************************************************** // + } else if (kind == FillLinear) { + // Fill `dst` with value at `*src`. + eigen_assert(src_stride == 0 && dst_stride == 1); + const IndexType unrolled_size = count - 4 * PacketSize; + Packet p = pload1(src); + for (; i <= unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + pstoreu(dst + i + j * PacketSize, p); + } + } + for (; i <= vectorized_size; i += PacketSize) { + pstoreu(dst + i, p); + } + for (; i < count; ++i) { + dst[i] = *src; + } + // ******************************************************************** // + } else if (kind == FillScatter) { + // Scatter `*src` into `dst`. + eigen_assert(src_stride == 0 && dst_stride != 1); + Packet p = pload1(src); + for (; i <= vectorized_size; i += PacketSize) { + pscatter(dst + i * dst_stride, p, dst_stride); + } + for (; i < count; ++i) { + dst[i * dst_stride] = *src; + } + // ******************************************************************** // + } else if (kind == Gather) { + // Gather from `src` into `dst`. 
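+      // (For instance, with src_stride == 3 and count == 8 this reads
+      // src[0], src[3], ..., src[21] into dst[0..7], gathering whole packets
+      // with pgather and storing them with pstoreu.)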
+ eigen_assert(dst_stride == 1); + for (; i <= vectorized_size; i += PacketSize) { + Packet p = pgather(src + i * src_stride, src_stride); + pstoreu(dst + i, p); + } + for (; i < count; ++i) { + dst[i] = src[i * src_stride]; + } + // ******************************************************************** // + } else if (kind == Random) { + // Random. + for (; i < count; ++i) { + dst[i * dst_stride] = src[i * src_stride]; + } + } else { + eigen_assert(false); + } + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block. +// It's possible to specify src->dst dimension mapping for the copy operation. +// Dimensions of `dst` specify how many elements have to be copied, for the +// `src` we need to know only stride to navigate through source memory buffer. + +template +class TensorBlockIO { + static const bool IsColMajor = (Layout == ColMajor); + + typedef StridedLinearBufferCopy LinCopy; + + public: + typedef DSizes Dimensions; + typedef DSizes DimensionsMap; + + struct Dst { + Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, + IndexType dst_offset = 0) + : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {} + + Dimensions dims; + Dimensions strides; + Scalar* data; + IndexType offset; + }; + + struct Src { + Src(const Dimensions& src_strides, const Scalar* src, + IndexType src_offset = 0) + : strides(src_strides), data(src), offset(src_offset) {} + + Dimensions strides; + const Scalar* data; + IndexType offset; + }; + + // Copies data to `dst` from `src`, using provided dimensions mapping: + // + // src_dimension_index = dst_to_src_dim_map[dst_dimension_index] + // + // Returns the number of copied elements. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy( + const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) { + // Copy single scalar value from `src` to `dst`. + if (NumDims == 0) { + *(dst.data + dst.offset) = *(src.data + src.offset); + return 1; + } + + // Both `dst` and `src` must have contiguous innermost dimension. We also + // accept the special case with stride '0', because it's used as a trick to + // implement broadcasting. + { + int inner_dim = IsColMajor ? 0 : NumDims - 1; + EIGEN_UNUSED_VARIABLE(inner_dim); + eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0); + eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0); + } + + // Give a shorter name to `dst_to_src_dim_map`. + const DimensionsMap& dim_map = dst_to_src_dim_map; + + // Do not squeeze reordered inner dimensions. + int num_squeezable_dims = NumSqueezableInnerDims(dim_map); + + // NOTE: We find the innermost dimension (contiguous in memory) in the dst + // block, and we write data linearly into that dimension, reading it from + // the src. If dimensions are reordered, we might end up reading data from + // the src with `stride != 1`. + // + // NOTE: Random-Read/Linear-Write can be up to ~2X faster than + // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680 + + // Find the innermost dimension in the dst whose size is not 1. This is the + // effective inner dim. + int num_size_one_inner_dims = 0; + for (int i = 0; i < num_squeezable_dims; ++i) { + const int dst_dim = IsColMajor ? i : NumDims - i - 1; + if (dst.dims[dst_dim] != 1) break; + num_size_one_inner_dims++; + } + + // If all dimensions are of size 1, just copy a scalar from `src` to `dst`. 
+ if (num_size_one_inner_dims == NumDims) { + *(dst.data + dst.offset) = *(src.data + src.offset); + return 1; + } + + // Outermost dimension in the dst with `stride == 1` (contiguous in memory). + const int dst_stride1_dim = IsColMajor + ? num_size_one_inner_dims + : NumDims - num_size_one_inner_dims - 1; + + // Dimension in the src that corresponds to the dst innermost dimension. + const int src_dim_for_dst_stride1_dim = + NumDims == 0 ? 1 : dim_map[dst_stride1_dim]; + + // Size of the innermost dimension (length of contiguous blocks of memory). + IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim]; + + // Squeeze multiple inner dims into one if they are contiguous in `dst` and + // `src` memory, so we can do less linear copy calls. + for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) { + const int dst_dim = IsColMajor ? i : NumDims - i - 1; + const IndexType dst_stride = dst.strides[dst_dim]; + const IndexType src_stride = src.strides[dim_map[dst_dim]]; + if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) { + dst_inner_dim_size *= dst.dims[dst_dim]; + ++num_size_one_inner_dims; + } else { + break; + } + } + + // Setup strides to read data from `src` and write to `dst`. + IndexType input_offset = src.offset; + IndexType output_offset = dst.offset; + IndexType input_stride = + NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim]; + IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim]; + + const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; + array it; + + // Initialize block iterator state. Squeeze away any dimension of size 1. + int idx = 0; // currently initialized iterator state index + for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { + const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2; + if (dst.dims[dst_dim] == 1) continue; + + it[idx].size = dst.dims[dst_dim]; + it[idx].input_stride = src.strides[dim_map[dst_dim]]; + it[idx].output_stride = dst.strides[dst_dim]; + + it[idx].input_span = it[idx].input_stride * (it[idx].size - 1); + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + + idx++; + } + + // Iterate copying data from src to dst. + const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize(); + +#define COPY_INNER_DIM(KIND) \ + IndexType num_copied = 0; \ + for (num_copied = 0; num_copied < block_total_size; \ + num_copied += dst_inner_dim_size) { \ + LinCopy::template Run( \ + typename LinCopy::Dst(output_offset, output_stride, dst.data), \ + typename LinCopy::Src(input_offset, input_stride, src.data), \ + dst_inner_dim_size); \ + \ + for (int j = 0; j < idx; ++j) { \ + if (++it[j].count < it[j].size) { \ + input_offset += it[j].input_stride; \ + output_offset += it[j].output_stride; \ + break; \ + } \ + it[j].count = 0; \ + input_offset -= it[j].input_span; \ + output_offset -= it[j].output_span; \ + } \ + } \ + return num_copied; + + if (input_stride == 1 && output_stride == 1) { + COPY_INNER_DIM(LinCopy::Linear); + } else if (input_stride == 1 && output_stride != 1) { + COPY_INNER_DIM(LinCopy::Scatter); + } else if (input_stride == 0 && output_stride == 1) { + COPY_INNER_DIM(LinCopy::FillLinear); + } else if (input_stride == 0 && output_stride != 1) { + COPY_INNER_DIM(LinCopy::FillScatter); + } else if (output_stride == 1) { + COPY_INNER_DIM(LinCopy::Gather); + } else { + COPY_INNER_DIM(LinCopy::Random); + } + +#undef COPY_INNER_DIM + } + + // Copy from `src` to `dst` with an identity src->dst dimension map. 
Returns + // the number of copied elements. + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst, + const Src& src) { + DimensionsMap dst_to_src_map; + for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i; + return Copy(dst, src, dst_to_src_map); + } + + private: + struct BlockIteratorState { + BlockIteratorState() + : size(0), + count(0), + input_stride(0), + output_stride(0), + input_span(0), + output_span(0) {} + + IndexType size; + IndexType count; + IndexType input_stride; + IndexType output_stride; + IndexType input_span; + IndexType output_span; + }; + + // Compute how many inner dimensions it's allowed to squeeze when doing IO + // between two tensor blocks. It's safe to squeeze inner dimensions, only + // if they are not reordered. + static int NumSqueezableInnerDims(const DimensionsMap& dim_map) { + int num_squeezable_dims = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + if (dim_map[dim] != dim) break; + num_squeezable_dims++; + } + return num_squeezable_dims; + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to +// a Tensor block defined by `desc`, backed by a memory buffer at `target`. +// +// Currently there is no way to write from a Tensor expression to a block of +// memory, if dimensions are reordered. If you need to do that, you should +// materialize a Tensor block expression into a memory buffer, and then use +// TensorBlockIO to copy data between two memory buffers with a custom +// `target->src` dimension map (see definition above). +// +// Also currently the innermost dimension of `target` must have a stride '1' +// (contiguous in memory). This restriction could be lifted with a `pscatter`, +// but in practice it's never needed, and there is a similar TensorBlockIO +// workaround for that. +// +// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO +// where `src` is a tensor expression. Explore if it is possible to rewrite IO +// to use expressions instead of pointers, and after that TensorBlockAssignment +// will become an alias to IO. +template +class TensorBlockAssignment { + // We will use coeff/packet path to evaluate block expressions. 
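+  // A usage sketch (names hypothetical): given a lazy block expression `expr`
+  // and a destination buffer `dst_ptr`,
+  //
+  //   TensorBlockAssignment<Scalar, NumDims, BlockExpr>::Run(
+  //       TensorBlockAssignment<Scalar, NumDims, BlockExpr>::target(
+  //           dims, internal::strides<ColMajor>(dims), dst_ptr),
+  //       expr);
+  //
+  // evaluates `expr` coefficient- or packet-wise straight into `dst_ptr`.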
+  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
+      TensorBlockEvaluator;
+
+  typedef DSizes<IndexType, NumDims> Dimensions;
+
+  enum {
+    Vectorizable = packet_traits<Scalar>::Vectorizable,
+    PacketSize = packet_traits<Scalar>::size
+  };
+
+  template <bool Vectorizable, typename Evaluator>
+  struct InnerDimAssign {
+    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
+                                        const Evaluator& eval,
+                                        IndexType eval_offset) {
+      for (IndexType i = 0; i < count; ++i) {
+        target[i] = eval.coeff(eval_offset + i);
+      }
+    }
+  };
+
+  template <typename Evaluator>
+  struct InnerDimAssign<true, Evaluator> {
+    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
+                                        const Evaluator& eval,
+                                        IndexType eval_offset) {
+      typedef typename packet_traits<Scalar>::type Packet;
+
+      const IndexType unrolled_size = count - 4 * PacketSize;
+      const IndexType vectorized_size = count - PacketSize;
+      IndexType i = 0;
+
+      for (; i <= unrolled_size; i += 4 * PacketSize) {
+        for (int j = 0; j < 4; ++j) {
+          const IndexType idx = eval_offset + i + j * PacketSize;
+          Packet p = eval.template packet<Unaligned>(idx);
+          pstoreu(target + i + j * PacketSize, p);
+        }
+      }
+
+      for (; i <= vectorized_size; i += PacketSize) {
+        Packet p = eval.template packet<Unaligned>(eval_offset + i);
+        pstoreu(target + i, p);
+      }
+
+      for (; i < count; ++i) {
+        target[i] = eval.coeff(eval_offset + i);
+      }
+    }
+  };
+
+ public:
+  struct Target {
+    Target(const Dimensions& target_dims, const Dimensions& target_strides,
+           Scalar* target_data, IndexType target_offset = 0)
+        : dims(target_dims),
+          strides(target_strides),
+          data(target_data),
+          offset(target_offset) {}
+
+    Dimensions dims;
+    Dimensions strides;
+    Scalar* data;
+    IndexType offset;
+  };
+
+  static Target target(const Dimensions& target_dims,
+                       const Dimensions& target_strides, Scalar* target_data,
+                       IndexType target_offset = 0) {
+    return Target(target_dims, target_strides, target_data, target_offset);
+  }
+
+  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
+  static Target target(
+      const DSizes<TargetDimsIndexType, NumDims>& target_dims,
+      const DSizes<TargetStridesIndexType, NumDims>& target_strides,
+      Scalar* target_data, IndexType target_offset = 0) {
+    // DSizes constructor will do index type promotion if it's safe.
+    return Target(Dimensions(target_dims), Dimensions(target_strides),
+                  target_data, target_offset);
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const Target& target, const TensorBlockExpr& expr) {
+    // Prepare an evaluator for the block expression.
+    DefaultDevice default_device;
+    TensorBlockEvaluator eval(expr, default_device);
+
+    // Tensor block expression dimensions should match destination dimensions.
+    eigen_assert(dimensions_match(target.dims, eval.dimensions()));
+
+    static const int Layout = TensorBlockEvaluator::Layout;
+    static const bool is_col_major = Layout == ColMajor;
+
+    // Initialize the output inner dimension size based on the layout.
+    const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
+    const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
+    IndexType output_inner_dim_size = target.dims[inner_dim_idx];
+
+    // Target inner dimension stride must be '1'.
+    eigen_assert(target.strides[inner_dim_idx] == 1);
+
+    // Squeeze multiple inner dims into one if they are contiguous in `target`.
+    IndexType num_squeezed_dims = 0;
+    for (Index i = 1; i < NumDims; ++i) {
+      const Index dim = is_col_major ? i : NumDims - i - 1;
+      const IndexType target_stride = target.strides[dim];
+
+      if (output_inner_dim_size == target_stride) {
+        output_inner_dim_size *= target.dims[dim];
+        num_squeezed_dims++;
+      } else {
+        break;
+      }
+    }
+
+    // Initialize the output block iterator state. Dimensions in this array
+    // are always in inner_most -> outer_most order (col major layout).
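+    // (E.g. for a col-major target with dims [4, 3, 2] and padded strides
+    // [1, 8, 24] nothing is squeezed, so it[0] walks the size-3 dimension and
+    // it[1] the size-2 dimension, while the inner loop writes runs of 4.)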
+ array it; + + int idx = 0; // currently initialized iterator state index + for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) { + const Index dim = is_col_major ? i + 1 : NumDims - i - 2; + + it[idx].count = 0; + it[idx].size = target.dims[dim]; + it[idx].output_stride = target.strides[dim]; + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + idx++; + } + + // We read block expression from the beginning, and start writing data to + // `target` at given offset. + IndexType input_offset = 0; + IndexType output_offset = target.offset; + + // Iterate copying data from `eval` to `target`. + for (IndexType i = 0; i < output_size; i += output_inner_dim_size) { + // Assign to `target` at current offset. + InnerDimAssign::Run(target.data + output_offset, + output_inner_dim_size, eval, + input_offset); + + // Move input offset forward by the number of assigned coefficients. + input_offset += output_inner_dim_size; + + // Update index. + for (int j = 0; j < idx; ++j) { + if (++it[j].count < it[j].size) { + output_offset += it[j].output_stride; + break; + } + it[j].count = 0; + output_offset -= it[j].output_span; + } + } + } + + private: + struct BlockIteratorState { + BlockIteratorState() + : count(0), size(0), output_stride(0), output_span(0) {} + + IndexType count; + IndexType size; + IndexType output_stride; + IndexType output_span; + }; +}; + +// -------------------------------------------------------------------------- // + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h deleted file mode 100644 index 029180ca5..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h +++ /dev/null @@ -1,1481 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H -#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H - -namespace Eigen { -namespace internal { - -// -------------------------------------------------------------------------- // -// Forward declarations for templates defined below. -template -class TensorBlockIOV2; - -// -------------------------------------------------------------------------- // -// Helper function to compute strides for densely stored buffer of given -// dimensions. - -// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use -// this function instead everywhere. -template -EIGEN_ALWAYS_INLINE DSizes strides( - const DSizes& dimensions) { - DSizes strides; - if (NumDims == 0) return strides; - - // TODO(ezhulenev): Use templates to unroll this loop (similar to - // h_array_reduce in CXX11meta.h)? Benchmark it. 
-  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-    strides[0] = 1;
-    for (int i = 1; i < NumDims; ++i) {
-      strides[i] = strides[i - 1] * dimensions[i - 1];
-    }
-  } else {
-    strides[NumDims - 1] = 1;
-    for (int i = NumDims - 2; i >= 0; --i) {
-      strides[i] = strides[i + 1] * dimensions[i + 1];
-    }
-  }
-
-  return strides;
-}
-
-template <int Layout, typename IndexType, size_t NumDims>
-EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
-    const Eigen::array<IndexType, NumDims>& dimensions) {
-  return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
-}
-
-template <int Layout, std::ptrdiff_t... Indices>
-EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
-    const Sizes<Indices...>& sizes) {
-  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
-}
-
-// -------------------------------------------------------------------------- //
-
-// Tensor block shape type defines what are the shape preference for the blocks
-// extracted from the larger tensor.
-//
-// Example: blocks of 100 elements from the large 100x100 tensor:
-// - tensor: 100x100
-// - target_block_size: 100
-//
-// TensorBlockShapeType:
-//  - kUniformAllDims: 100 blocks of size 10x10
-//  - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column
-//    or row major layout)
-enum class TensorBlockV2ShapeType { kUniformAllDims, kSkewedInnerDims };
-
-struct TensorBlockV2ResourceRequirements {
-  TensorBlockV2ShapeType shape_type;
-  size_t size;
-
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE TensorBlockV2ResourceRequirements
-  merge(const TensorBlockV2ResourceRequirements& lhs,
-        const TensorBlockV2ResourceRequirements& rhs) {
-    return {merge(lhs.shape_type, rhs.shape_type), merge(rhs.size, lhs.size)};
-  }
-
-  // This is a resource requirement that should be returned from expressions
-  // that do not have any block evaluation preference (e.g. default tensor
-  // expression with raw buffer access).
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE TensorBlockV2ResourceRequirements any() {
-    return {TensorBlockV2ShapeType::kUniformAllDims, 1};
-  }
-
- private:
-  using Requirements = TensorBlockV2ResourceRequirements;
-
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
-    return numext::maxi(lhs_size, rhs_size);
-  }
-
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE TensorBlockV2ShapeType merge(
-      TensorBlockV2ShapeType lhs, TensorBlockV2ShapeType rhs) {
-    return (lhs == TensorBlockV2ShapeType::kSkewedInnerDims ||
-            rhs == TensorBlockV2ShapeType::kSkewedInnerDims)
-               ? TensorBlockV2ShapeType::kSkewedInnerDims
-               : TensorBlockV2ShapeType::kUniformAllDims;
-  }
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockDescriptor specifies a block offset within a tensor and the block
-// sizes along each of the tensor dimensions.
-
-template <int NumDims, typename IndexType = Eigen::Index>
-class TensorBlockDescriptor {
- public:
-  typedef DSizes<IndexType, NumDims> Dimensions;
-
-  // If we evaluate a Tensor assignment, and the expression on the left already
-  // has a memory buffer, then we might do a performance optimization, and
-  // evaluate the root expression directly into the final output memory.
-  // Sometimes it's possible to reuse it for materializing subexpressions
-  // inside an expression tree, to avoid dynamic memory allocation.
-  //
-  // The pointer type of the underlying storage is erased, because passing
-  // Scalar type through all the expression evaluation layers is way too many
-  // templates. In practice destination buffer type should always match the
-  // evaluated expression scalar type.
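// For illustration (hypothetical): a float expression registers its output
// buffer via AddDestinationBuffer<Layout>(float_ptr, strides); the pointer is
// stored as void*, and destination().template data<float>() recovers it, with
// m_data_type_size guarding against a scalar type mismatch.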
-  class DestinationBuffer {
-   public:
-    enum DestinationBufferKind : int {
-      // The above explicit specification of "int" as the enum basetype is
-      // needed to get around a HIPCC link error ("the field type is not
-      // amp-compatible") which is issued for class members with the enum type.
-      // TODO(rocm):
-      // remove the "int" basetype once HIPCC has been fixed to not error out
-      // in the above scenario.
-
-      // Destination buffer is not defined (`m_data` == nullptr).
-      kEmpty,
-
-      // Tensor block defined by an owning tensor block descriptor can fit
-      // contiguously into the destination buffer. In this case it's safe to
-      // materialize tensor block in the destination buffer, wrap it in a
-      // TensorMap, and use it to build an Eigen expression on top of it.
-      kContiguous,
-
-      // Destination buffer strides do not match strides of the contiguously
-      // stored block, and it's impossible to define a TensorMap over this
-      // buffer. However if we are evaluating a root of an expression tree, we
-      // still can materialize an output into this destination, because we can
-      // guarantee that no one will ever access it through block API.
-      //
-      // In theory it is possible to build a valid TensorStriding expression
-      // on top of this destination buffer, however it has inefficient
-      // coeff/packet access, and defeats the purpose of fast block
-      // evaluation API.
-      kStrided
-    };
-
-    template <typename Scalar>
-    Scalar* data() const {
-      eigen_assert(m_data_type_size == sizeof(Scalar));
-      return static_cast<Scalar*>(m_data);
-    }
-
-    const Dimensions& strides() const { return m_strides; }
-    const DestinationBufferKind& kind() const { return m_kind; }
-
-   private:
-    friend class TensorBlockDescriptor;
-
-    DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
-
-    template <typename Scalar>
-    DestinationBuffer(Scalar* data, const Dimensions& strides,
-                      DestinationBufferKind kind)
-        : m_data(static_cast<void*>(data)),
-          m_data_type_size(sizeof(Scalar)),
-          m_strides(strides),
-          m_kind(kind) {}
-
-    template <int Layout, typename Scalar>
-    static DestinationBuffer make(const TensorBlockDescriptor& desc,
-                                  Scalar* data, const Dimensions& strides) {
-      return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
-    }
-
-    template <int Layout>
-    static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
-                                      const Dimensions& strides) {
-      const Dimensions& desc_dims = desc.dimensions();
-      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
-      for (int i = 0; i < NumDims; ++i) {
-        if (desc_dims[i] == 1) continue;
-        if (desc_strides[i] != strides[i]) return kStrided;
-      }
-      return kContiguous;
-    }
-
-    // Storage pointer is type erased, to reduce template bloat, but we still
-    // keep the size of the underlying element type for error checking.
-    void* m_data;
-    size_t m_data_type_size;
-
-    // Destination buffer dimensions always match the dimensions of a tensor
-    // block descriptor it belongs to, however strides might be different.
-    Dimensions m_strides;
-
-    DestinationBufferKind m_kind;
-  };
-
-  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
-                        const DestinationBuffer& destination)
-      : m_offset(offset),
-        m_dimensions(dimensions),
-        m_destination(destination) {}
-
-  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
-      : m_offset(offset),
-        m_dimensions(dimensions),
-        m_destination(DestinationBuffer()) {}
-
-  IndexType offset() const { return m_offset; }
-  const Dimensions& dimensions() const { return m_dimensions; }
-  IndexType dimension(int index) const { return m_dimensions[index]; }
-  IndexType size() const { return array_prod(m_dimensions); }
-
-  const DestinationBuffer& destination() const { return m_destination; }
-
-  template <int Layout, typename Scalar>
-  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
-    eigen_assert(dst_base != NULL);
-    m_destination =
-        DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
-  }
-
-  template <int Layout, typename Scalar, typename DstStridesIndexType>
-  void AddDestinationBuffer(
-      Scalar* dst_base,
-      const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
-    // DSizes constructor will do index type promotion if it's safe.
-    AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
-  }
-
-  TensorBlockDescriptor& DropDestinationBuffer() {
-    m_destination.m_data = NULL;
-    m_destination.m_kind = DestinationBuffer::kEmpty;
-    return *this;
-  }
-
-  bool HasDestinationBuffer() const {
-    return m_destination.kind() != DestinationBuffer::kEmpty;
-  }
-
-  // Returns a copy of `*this` with updated offset.
-  TensorBlockDescriptor WithOffset(IndexType offset) const {
-    return TensorBlockDescriptor(offset, m_dimensions, m_destination);
-  }
-
- private:
-  // Offset and dimensions are immutable after construction. Block descriptor
-  // can only be mutated by adding or dropping destination.
-  const IndexType m_offset;
-  const Dimensions m_dimensions;
-  DestinationBuffer m_destination;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockMapper is responsible for iterating over the blocks of a tensor.
-
-template <int NumDims, int Layout, typename IndexType = Eigen::Index>
-class TensorBlockV2Mapper {
-  typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
-
- public:
-  typedef DSizes<IndexType, NumDims> Dimensions;
-
-  TensorBlockV2Mapper() = default;
-  TensorBlockV2Mapper(const DSizes<IndexType, NumDims>& dimensions,
-                      const TensorBlockV2ResourceRequirements& requirements)
-      : m_tensor_dimensions(dimensions), m_requirements(requirements) {
-    // Initialize `m_block_dimensions`.
-    InitializeBlockDimensions();
-
-    // Calculate block counts by dimension and total block count.
-    DSizes<IndexType, NumDims> block_count;
-    for (int i = 0; i < NumDims; ++i) {
-      block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
-    }
-    m_total_block_count = array_prod(block_count);
-
-    // Calculate block strides (used for enumerating blocks).
-    m_tensor_strides = strides<Layout>(m_tensor_dimensions);
-    m_block_strides = strides<Layout>(block_count);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
-    return m_total_block_count;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
-    return m_block_dimensions.TotalSize();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
-  blockDimensions() const {
-    return m_block_dimensions;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  BlockDescriptor blockDescriptor(IndexType block_index) const {
-    static const bool isColMajor = Layout == static_cast<int>(ColMajor);
-
-    IndexType offset = 0;
-    DSizes<IndexType, NumDims> dimensions;
-
-    if (NumDims == 0) return BlockDescriptor(offset, dimensions);
-
-    // Iterate outer -> inner dimensions.
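// For illustration (hypothetical, ColMajor): a 10x10 tensor split into 5x5
// blocks has block_count = [2, 2] and m_block_strides = [1, 2]; block_index 3
// decomposes into block coordinates [1, 1], giving dimensions [5, 5] and
// offset = 5 * 1 + 5 * 10 = 55.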
-    for (int i = NumDims - 1; i >= 0; --i) {
-      const int dim = isColMajor ? i : NumDims - i - 1;
-
-      const IndexType idx = block_index / m_block_strides[dim];
-      block_index -= idx * m_block_strides[dim];
-
-      const IndexType coord = idx * m_block_dimensions[dim];
-      dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
-                                     m_block_dimensions[dim]);
-      offset += coord * m_tensor_strides[dim];
-    }
-
-    return {offset, dimensions};
-  }
-
- private:
-  void InitializeBlockDimensions() {
-    // Requested block shape and size.
-    const TensorBlockV2ShapeType shape_type = m_requirements.shape_type;
-    const IndexType target_block_size =
-        numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
-
-    // Corner case: one of the dimensions is zero. Logic below is too complex
-    // to handle this case on a general basis, just use unit block size.
-    // Note: we must not yield blocks with zero dimensions (recipe for
-    // overflows/underflows, divisions by zero and NaNs later).
-    if (m_tensor_dimensions.TotalSize() == 0) {
-      for (int i = 0; i < NumDims; ++i) {
-        m_block_dimensions[i] = 1;
-      }
-      return;
-    }
-
-    // If tensor fits into a target block size, evaluate it as a single block.
-    if (m_tensor_dimensions.TotalSize() <= target_block_size) {
-      m_block_dimensions = m_tensor_dimensions;
-      return;
-    }
-
-    static const bool isColMajor = Layout == static_cast<int>(ColMajor);
-
-    // Block shape skewed towards inner dimension.
-    if (shape_type == TensorBlockV2ShapeType::kSkewedInnerDims) {
-      IndexType coeff_to_allocate = target_block_size;
-
-      for (int i = 0; i < NumDims; ++i) {
-        const int dim = isColMajor ? i : NumDims - i - 1;
-        m_block_dimensions[dim] =
-            numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
-        coeff_to_allocate = divup(
-            coeff_to_allocate,
-            numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
-      }
-      eigen_assert(coeff_to_allocate == 1);
-
-    } else if (shape_type == TensorBlockV2ShapeType::kUniformAllDims) {
-      // Tensor will not fit within 'target_block_size' budget: calculate tensor
-      // block dimension sizes based on "square" dimension size target.
-      const IndexType dim_size_target = convert_index<IndexType>(
-          std::pow(static_cast<float>(target_block_size),
-                   1.0f / static_cast<float>(m_block_dimensions.rank())));
-
-      for (int i = 0; i < NumDims; ++i) {
-        // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
-        // a multiple of the packet size. Note that reducing
-        // 'block_dim_size' in this manner can increase the number of
-        // blocks, and so will amplify any per-block overhead.
-        m_block_dimensions[i] =
-            numext::mini(dim_size_target, m_tensor_dimensions[i]);
-      }
-
-      // Add any un-allocated coefficients to inner dimension(s).
-      IndexType total_size = m_block_dimensions.TotalSize();
-      for (int i = 0; i < NumDims; ++i) {
-        const int dim = isColMajor ? i : NumDims - i - 1;
-
-        if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
-          const IndexType total_size_other_dims =
-              total_size / m_block_dimensions[dim];
-          const IndexType alloc_avail =
-              divup<IndexType>(target_block_size, total_size_other_dims);
-          if (alloc_avail == m_block_dimensions[dim]) {
-            // Insufficient excess coefficients to allocate.
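// (For illustration, hypothetical: with target_block_size = 100, a 100x2
// tensor starts from uniform dims [10, 2], and this loop grows the inner
// dimension to [50, 2] to use the remaining budget before breaking.)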
-            break;
-          }
-          m_block_dimensions[dim] =
-              numext::mini(m_tensor_dimensions[dim], alloc_avail);
-          total_size = total_size_other_dims * m_block_dimensions[dim];
-        }
-      }
-
-    } else {
-      eigen_assert(false);  // unknown block shape
-    }
-
-    eigen_assert(m_block_dimensions.TotalSize() >=
-                 numext::mini<IndexType>(target_block_size,
-                                         m_tensor_dimensions.TotalSize()));
-  }
-
-  DSizes<IndexType, NumDims> m_tensor_dimensions;
-  TensorBlockV2ResourceRequirements m_requirements;
-
-  DSizes<IndexType, NumDims> m_block_dimensions;
-  IndexType m_total_block_count;
-
-  DSizes<IndexType, NumDims> m_tensor_strides;
-  DSizes<IndexType, NumDims> m_block_strides;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockScratchAllocator is responsible for allocating temporary buffers
-// for block evaluation (output or input block materialization). Given that
-// Eigen expression traversal order is deterministic, all temporary allocations
-// are happening in the same order, and usually have exactly the same size.
-// Scratch allocator keeps a trace of all dynamic allocations, and after the
-// first block evaluation is completed, we should be able to reuse all the
-// temporary buffers for the next block evaluation.
-
-template <typename Device>
-class TensorBlockScratchAllocator {
- public:
-  explicit TensorBlockScratchAllocator(const Device& device)
-      : m_device(device), m_allocation_index(0) {}
-
-  ~TensorBlockScratchAllocator() {
-    for (size_t i = 0; i < m_allocations.size(); ++i) {
-      m_device.deallocate(m_allocations[i].ptr);
-    }
-  }
-
-  void* allocate(size_t size) {
-    // TODO(ezhulenev): Remove when replaced with inlined vector.
-    if (m_allocations.capacity() == 0) m_allocations.reserve(8);
-
-    // Check if we already have an existing allocation at current index.
-    const int num_allocations = static_cast<int>(m_allocations.size());
-    const bool has_allocation = m_allocation_index < num_allocations;
-
-    // Allocation index can't be larger than the number of allocations.
-    eigen_assert(m_allocation_index <= num_allocations);
-
-    // If we have an existing allocation, and its size is larger or equal to
-    // the requested size, we do nothing.
-
-    // If the current allocation can't fit the requested size, we deallocate
-    // it, and replace it with a larger allocation.
-    if (has_allocation && m_allocations[m_allocation_index].size < size) {
-      m_device.deallocate(m_allocations[m_allocation_index].ptr);
-      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
-      m_allocations[m_allocation_index].size = size;
-    }
-
-    // Make a new allocation if we don't have an existing one.
-    if (!has_allocation) {
-      Allocation allocation;
-      allocation.ptr = m_device.allocate(size);
-      allocation.size = size;
-      m_allocations.push_back(allocation);
-    }
-
-    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
-    eigen_assert(m_allocations[m_allocation_index].size >= size);
-
-    return m_allocations[m_allocation_index++].ptr;
-  }
-
-  void reset() { m_allocation_index = 0; }
-
- private:
-  struct Allocation {
-    void* ptr;
-    size_t size;
-  };
-
-  const Device& m_device;
-  int m_allocation_index;
-  // TODO(ezhulenev): This should be an inlined vector.
-  std::vector<Allocation> m_allocations;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockKind represents all possible block kinds, that can be produced by
-// the TensorEvaluator::evalBlock function.
-enum TensorBlockKind {
-  // Tensor block that is a lazy expression that must be assigned to a
-  // destination using TensorBlockAssign.
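// (For illustration: a block of the hypothetical expression `a + b` stays
// lazy as kExpr; the sum is evaluated only when the block is assigned.)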
-  kExpr,
-
-  // Tensor block that is a view into a memory buffer owned by an underlying
-  // Tensor expression (e.g. it can be a view into a Tensor buffer).
-  kView,
-
-  // Tensor block that was materialized in a scratch memory buffer, allocated
-  // with TensorBlockScratchAllocator. This block must be copied to a
-  // destination, similar to a block of `kExpr` type.
-  kMaterializedInScratch,
-
-  // Tensor block that was materialized directly into the final output memory
-  // buffer. For example if the left side of an assignment is a Tensor, we can
-  // directly materialize the block in the destination memory.
-  //
-  // If strides in the output buffer do not match tensor block strides, the
-  // Tensor expression will be invalid, and should not be used by
-  // TensorBlockAssign or for constructing another block expression.
-  kMaterializedInOutput
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockNotImplemented should be used to define the TensorBlock typedef
-// in TensorEvaluators that do not support block evaluation.
-
-class TensorBlockNotImplemented {
- public:
-  typedef void XprType;
-};
-
-// -------------------------------------------------------------------------- //
-// XprScalar extracts the Scalar type from the Eigen expressions (if the
-// expression type is not void). It's required to be able to define a lazy
-// block expression for argument types that do not support block evaluation.
-
-template <typename XprType>
-struct XprScalar {
-  typedef typename XprType::Scalar type;
-};
-template <>
-struct XprScalar<void> {
-  typedef void type;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorMaterializedBlock is a fully evaluated block of the original tensor,
-// and XprType is just a TensorMap over the data. This block type is typically
-// used to materialize blocks of tensor expressions that can't be efficiently
-// represented as lazy Tensor expressions with fast coeff/packet operations,
-// e.g. we materialize all broadcasts into evaluated blocks.
-//
-// TensorMaterializedBlock does not own its memory buffer, it's either a memory
-// buffer that backs the original expression (e.g. block is just a view into a
-// Tensor), or a memory buffer allocated with scratch allocator, and in this
-// case the scratch allocator will deallocate it at the end of block based
-// expression execution.
-//
-// If the block was evaluated directly into the output buffer, and strides in
-// the output buffer do not match block strides, the TensorMap expression will
-// be invalid, and should never be used in block assignment or any other tensor
-// expression.
-
-template <typename Scalar, int NumDims, int Layout,
-          typename IndexType = Eigen::Index>
-class TensorMaterializedBlock {
- public:
-  typedef DSizes<IndexType, NumDims> Dimensions;
-  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
-
-  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
-                          const Dimensions& dimensions, bool valid_expr = true)
-      : m_kind(kind),
-        m_data(data),
-        m_dimensions(dimensions),
-        m_expr(m_data, m_dimensions),
-        m_valid_expr(valid_expr) {
-    eigen_assert(m_kind == internal::TensorBlockKind::kView ||
-                 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
-                 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
-  }
-
-  TensorBlockKind kind() const { return m_kind; }
-  // NOTE(ezhulenev): Returning XprType by value like in other block types
-  // causes asan failures. The theory is that XprType::Nested doesn't work
-  // properly for TensorMap.
-  const XprType& expr() const {
-    eigen_assert(m_valid_expr);
-    return m_expr;
-  }
-  const Scalar* data() const { return m_data; }
-  void cleanup() {}
-
-  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
-
-  // TensorMaterializedBlock can be backed by different types of storage:
-  //
-  //   (1) Contiguous block of memory allocated with scratch allocator.
-  //   (2) Contiguous block of memory reused from tensor block descriptor
-  //       destination buffer.
-  //   (3) Strided block of memory reused from tensor block descriptor
-  //       destination buffer.
-  //
-  class Storage {
-   public:
-    Scalar* data() const { return m_data; }
-    const Dimensions& dimensions() const { return m_dimensions; }
-    const Dimensions& strides() const { return m_strides; }
-
-    TensorMaterializedBlock AsTensorMaterializedBlock() const {
-      return TensorMaterializedBlock(
-          m_materialized_in_output
-              ? internal::TensorBlockKind::kMaterializedInOutput
-              : internal::TensorBlockKind::kMaterializedInScratch,
-          m_data, m_dimensions, !m_strided_storage);
-    }
-
-   private:
-    friend class TensorMaterializedBlock;
-
-    Storage(Scalar* data, const Dimensions& dimensions,
-            const Dimensions& strides, bool materialized_in_output,
-            bool strided_storage)
-        : m_data(data),
-          m_dimensions(dimensions),
-          m_strides(strides),
-          m_materialized_in_output(materialized_in_output),
-          m_strided_storage(strided_storage) {}
-
-    Scalar* m_data;
-    Dimensions m_dimensions;
-    Dimensions m_strides;
-    bool m_materialized_in_output;
-    bool m_strided_storage;
-  };
-
-  // Creates a storage for the materialized block either from the block
-  // descriptor destination buffer, or allocates a new buffer with the scratch
-  // allocator.
-  template <typename TensorBlockScratch>
-  EIGEN_STRONG_INLINE static Storage prepareStorage(
-      TensorBlockDesc& desc, TensorBlockScratch& scratch,
-      bool allow_strided_storage = false) {
-    // Try to reuse destination as an output block buffer.
-    typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
-
-    if (desc.destination().kind() == DestinationBuffer::kContiguous) {
-      Scalar* buffer = desc.destination().template data<Scalar>();
-      desc.DropDestinationBuffer();
-      return Storage(buffer, desc.dimensions(),
-                     internal::strides<Layout>(desc.dimensions()),
-                     /*materialized_in_output=*/true,
-                     /*strided_storage=*/false);
-
-    } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
-               allow_strided_storage) {
-      Scalar* buffer = desc.destination().template data<Scalar>();
-      desc.DropDestinationBuffer();
-      return Storage(buffer, desc.dimensions(), desc.destination().strides(),
-                     /*materialized_in_output=*/true, /*strided_storage=*/true);
-
-    } else {
-      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
-      return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
-                     internal::strides<Layout>(desc.dimensions()),
-                     /*materialized_in_output=*/false,
-                     /*strided_storage=*/false);
-    }
-  }
-
-  // Creates a materialized block for the given descriptor from a memory buffer.
-  template <typename DataDimensions, typename TensorBlockScratch>
-  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
-      const Scalar* data, const DataDimensions& data_dims,
-      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
-    eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
-
-    // If the tensor block dimensions cover a contiguous block of the
-    // underlying memory, we can skip the block buffer memory allocation, and
-    // construct a block from the existing `data` memory buffer.
-    //
-    // Example: (RowMajor layout)
-    //   data_dims:          [11, 12, 13, 14]
-    //   desc.dimensions():  [1,   1,  3, 14]
-    //
-    // In this case we can construct a TensorBlock starting at
-    // `data + desc.offset()`, with a `desc.dimensions()` block sizes.
-    static const bool is_col_major = Layout == ColMajor;
-
-    // Find out how many inner dimensions have a matching size.
-    int num_matching_inner_dims = 0;
-    for (int i = 0; i < NumDims; ++i) {
-      int dim = is_col_major ? i : NumDims - i - 1;
-      if (data_dims[dim] != desc.dimensions()[dim]) break;
-      ++num_matching_inner_dims;
-    }
-
-    // All the outer dimensions must be of size `1`, except a single dimension
-    // before the matching inner dimension (`3` in the example above).
-    bool can_use_direct_access = true;
-    for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
-      int dim = is_col_major ? i : NumDims - i - 1;
-      if (desc.dimension(dim) != 1) {
-        can_use_direct_access = false;
-        break;
-      }
-    }
-
-    if (can_use_direct_access) {
-      const Scalar* block_start = data + desc.offset();
-      return TensorMaterializedBlock(internal::TensorBlockKind::kView,
-                                     block_start, desc.dimensions());
-
-    } else {
-      // Reuse destination buffer or allocate new buffer with scratch allocator.
-      const Storage storage = prepareStorage(desc, scratch);
-
-      typedef internal::TensorBlockIOV2<Scalar, IndexType, NumDims, Layout>
-          TensorBlockIO;
-      typedef typename TensorBlockIO::Dst TensorBlockIODst;
-      typedef typename TensorBlockIO::Src TensorBlockIOSrc;
-
-      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
-                           data, desc.offset());
-      TensorBlockIODst dst(storage.dimensions(), storage.strides(),
-                           storage.data());
-
-      TensorBlockIO::Copy(dst, src);
-      return storage.AsTensorMaterializedBlock();
-    }
-  }
-
- private:
-  TensorBlockKind m_kind;
-  const Scalar* m_data;
-  Dimensions m_dimensions;
-  XprType m_expr;
-  bool m_valid_expr;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
-// functor to the blocks produced by the underlying Tensor expression.
-
-template <typename UnaryOp, typename ArgTensorBlock>
-class TensorCwiseUnaryBlock {
-  static const bool NoArgBlockAccess =
-      internal::is_void<typename ArgTensorBlock::XprType>::value;
-
- public:
-  typedef typename conditional<
-      NoArgBlockAccess, void,
-      TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
-      type XprType;
-
-  typedef typename XprScalar<XprType>::type Scalar;
-
-  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
-      : m_arg_block(arg_block), m_functor(functor) {}
-
-  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
-
-  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
-  const Scalar* data() const { return NULL; }
-  void cleanup() { m_arg_block.cleanup(); }
-
- private:
-  ArgTensorBlock m_arg_block;
-  UnaryOp m_functor;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorCwiseBinaryBlock is a lazy tensor expression block that applies
-// BinaryOp functor to the blocks produced by the underlying Tensor expression.
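// For illustration (hypothetical): for an expression like (a + b).abs(),
// block evaluation yields a TensorCwiseUnaryBlock wrapping a
// TensorCwiseBinaryBlock, and expr() rebuilds the same expression tree on top
// of the argument blocks' underlying expressions.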
-
-template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
-class TensorCwiseBinaryBlock {
-  static const bool NoArgBlockAccess =
-      internal::is_void<typename LhsTensorBlock::XprType>::value ||
-      internal::is_void<typename RhsTensorBlock::XprType>::value;
-
- public:
-  typedef typename conditional<
-      NoArgBlockAccess, void,
-      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
-                          const typename RhsTensorBlock::XprType> >::type
-      XprType;
-
-  typedef typename XprScalar<XprType>::type Scalar;
-
-  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
-                         const RhsTensorBlock& right_block,
-                         const BinaryOp& functor)
-      : m_left_block(left_block),
-        m_right_block(right_block),
-        m_functor(functor) {}
-
-  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
-
-  XprType expr() const {
-    return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
-  }
-
-  const Scalar* data() const { return NULL; }
-
-  void cleanup() {
-    m_left_block.cleanup();
-    m_right_block.cleanup();
-  }
-
- private:
-  LhsTensorBlock m_left_block;
-  RhsTensorBlock m_right_block;
-  BinaryOp m_functor;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorUnaryExprBlock is a lazy tensor expression block that can construct
-// an arbitrary tensor expression from a block of the underlying type (this is a
-// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
-
-template <typename BlockFactory, typename ArgTensorBlock>
-class TensorUnaryExprBlock {
-  typedef typename ArgTensorBlock::XprType ArgXprType;
-  static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
-
- public:
-  typedef typename conditional<
-      NoArgBlockAccess, void,
-      typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;
-
-  typedef typename XprScalar<XprType>::type Scalar;
-
-  TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
-                       const BlockFactory& factory)
-      : m_arg_block(arg_block), m_factory(factory) {}
-
-  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
-  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
-  const Scalar* data() const { return NULL; }
-  void cleanup() { m_arg_block.cleanup(); }
-
- private:
-  ArgTensorBlock m_arg_block;
-  BlockFactory m_factory;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorTernaryExprBlock is a lazy tensor expression block that can construct
-// an arbitrary tensor expression from three blocks of the underlying type.
-
-template <typename BlockFactory, typename Arg1TensorBlock,
-          typename Arg2TensorBlock, typename Arg3TensorBlock>
-class TensorTernaryExprBlock {
-  typedef typename Arg1TensorBlock::XprType Arg1XprType;
-  typedef typename Arg2TensorBlock::XprType Arg2XprType;
-  typedef typename Arg3TensorBlock::XprType Arg3XprType;
-
-  static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
-                                       internal::is_void<Arg2XprType>::value ||
-                                       internal::is_void<Arg3XprType>::value;
-
- public:
-  typedef typename conditional<
-      NoArgBlockAccess, void,
-      typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
-                                              Arg3XprType>::type>::type XprType;
-
-  typedef typename XprScalar<XprType>::type Scalar;
-
-  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
-                         const Arg2TensorBlock& arg2_block,
-                         const Arg3TensorBlock& arg3_block,
-                         const BlockFactory& factory)
-      : m_arg1_block(arg1_block),
-        m_arg2_block(arg2_block),
-        m_arg3_block(arg3_block),
-        m_factory(factory) {}
-
-  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
-  XprType expr() const {
-    return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
-                          m_arg3_block.expr());
-  }
-  const Scalar* data() const { return NULL; }
-  void cleanup() {
-    m_arg1_block.cleanup();
-    m_arg2_block.cleanup();
-    m_arg3_block.cleanup();
-  }
-
- private:
-  Arg1TensorBlock m_arg1_block;
-  Arg2TensorBlock m_arg2_block;
-  Arg3TensorBlock m_arg3_block;
-  BlockFactory m_factory;
-};
-
-// -------------------------------------------------------------------------- //
-// StridedLinearBufferCopy provides a method to copy data between two linear
-// buffers with different strides, with optimized paths for scatter/gather.
-
-template <typename Scalar, typename IndexType>
-class StridedLinearBufferCopy {
-  typedef typename packet_traits<Scalar>::type Packet;
-  enum {
-    Vectorizable = packet_traits<Scalar>::Vectorizable,
-    PacketSize = packet_traits<Scalar>::size
-  };
-
- public:
-  // Specifying linear copy kind statically gives ~30% speedup for small sizes.
-  enum Kind {
-    Linear = 0,       // src_stride == 1 && dst_stride == 1
-    Scatter = 1,      // src_stride == 1 && dst_stride != 1
-    FillLinear = 2,   // src_stride == 0 && dst_stride == 1
-    FillScatter = 3,  // src_stride == 0 && dst_stride != 1
-    Gather = 4,       // dst_stride == 1
-    Random = 5        // everything else
-  };
-
-  struct Dst {
-    Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
-
-    IndexType offset;
-    IndexType stride;
-    Scalar* data;
-  };
-
-  struct Src {
-    Src(IndexType o, IndexType s, const Scalar* d)
-        : offset(o), stride(s), data(d) {}
-
-    IndexType offset;
-    IndexType stride;
-    const Scalar* data;
-  };
-
-  template <typename StridedLinearBufferCopy::Kind kind>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
-                                                        const Src& src,
-                                                        const size_t count) {
-    Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
-              src.data);
-  }
-
- private:
-  template <typename StridedLinearBufferCopy::Kind kind>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const IndexType count, const IndexType dst_offset,
-      const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
-      const IndexType src_offset, const IndexType src_stride,
-      const Scalar* EIGEN_RESTRICT src_data) {
-    const Scalar* src = &src_data[src_offset];
-    Scalar* dst = &dst_data[dst_offset];
-
-    if (!Vectorizable) {
-      for (Index i = 0; i < count; ++i) {
-        dst[i * dst_stride] = src[i * src_stride];
-      }
-      return;
-    }
-
-    const IndexType vectorized_size = count - PacketSize;
-    IndexType i = 0;
-
-    if (kind == Linear) {
-      // ******************************************************************** //
-      // Linear copy from `src` to `dst`.
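// For illustration (hypothetical): count = 35 with PacketSize = 4 copies 32
// elements in the 4x-unrolled packet loop below, none in the single-packet
// loop, and the remaining 3 in the scalar tail.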
-      const IndexType unrolled_size = count - 4 * PacketSize;
-      eigen_assert(src_stride == 1 && dst_stride == 1);
-      for (; i <= unrolled_size; i += 4 * PacketSize) {
-        for (int j = 0; j < 4; ++j) {
-          Packet p = ploadu<Packet>(src + i + j * PacketSize);
-          pstoreu<Scalar>(dst + i + j * PacketSize, p);
-        }
-      }
-      for (; i <= vectorized_size; i += PacketSize) {
-        Packet p = ploadu<Packet>(src + i);
-        pstoreu<Scalar>(dst + i, p);
-      }
-      for (; i < count; ++i) {
-        dst[i] = src[i];
-      }
-      // ******************************************************************** //
-    } else if (kind == Scatter) {
-      // Scatter from `src` to `dst`.
-      eigen_assert(src_stride == 1 && dst_stride != 1);
-      for (; i <= vectorized_size; i += PacketSize) {
-        Packet p = ploadu<Packet>(src + i);
-        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
-      }
-      for (; i < count; ++i) {
-        dst[i * dst_stride] = src[i];
-      }
-      // ******************************************************************** //
-    } else if (kind == FillLinear) {
-      // Fill `dst` with value at `*src`.
-      eigen_assert(src_stride == 0 && dst_stride == 1);
-      const IndexType unrolled_size = count - 4 * PacketSize;
-      Packet p = pload1<Packet>(src);
-      for (; i <= unrolled_size; i += 4 * PacketSize) {
-        for (int j = 0; j < 4; ++j) {
-          pstoreu<Scalar>(dst + i + j * PacketSize, p);
-        }
-      }
-      for (; i <= vectorized_size; i += PacketSize) {
-        pstoreu<Scalar>(dst + i, p);
-      }
-      for (; i < count; ++i) {
-        dst[i] = *src;
-      }
-      // ******************************************************************** //
-    } else if (kind == FillScatter) {
-      // Scatter `*src` into `dst`.
-      eigen_assert(src_stride == 0 && dst_stride != 1);
-      Packet p = pload1<Packet>(src);
-      for (; i <= vectorized_size; i += PacketSize) {
-        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
-      }
-      for (; i < count; ++i) {
-        dst[i * dst_stride] = *src;
-      }
-      // ******************************************************************** //
-    } else if (kind == Gather) {
-      // Gather from `src` into `dst`.
-      eigen_assert(dst_stride == 1);
-      for (; i <= vectorized_size; i += PacketSize) {
-        Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
-        pstoreu<Scalar>(dst + i, p);
-      }
-      for (; i < count; ++i) {
-        dst[i] = src[i * src_stride];
-      }
-      // ******************************************************************** //
-    } else if (kind == Random) {
-      // Random.
-      for (; i < count; ++i) {
-        dst[i * dst_stride] = src[i * src_stride];
-      }
-    } else {
-      eigen_assert(false);
-    }
-  }
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockIO copies data from the `src` tensor block to the `dst` tensor
-// block. It's possible to specify a src->dst dimension mapping for the copy
-// operation. Dimensions of `dst` specify how many elements have to be copied;
-// for the `src` we need to know only the strides to navigate through the
-// source memory buffer.
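// For illustration (hypothetical): a shuffle that swaps the two dimensions of
// a 2-D tensor copies with dst_to_src_dim_map = [1, 0], so the dst inner
// dimension is read from the src with a non-unit stride (the Gather kind).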
-
-template <typename Scalar, typename IndexType, int NumDims, int Layout>
-class TensorBlockIOV2 {
-  static const bool IsColMajor = (Layout == ColMajor);
-
-  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
-
- public:
-  typedef DSizes<IndexType, NumDims> Dimensions;
-  typedef DSizes<int, NumDims> DimensionsMap;
-
-  struct Dst {
-    Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
-        IndexType dst_offset = 0)
-        : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
-
-    Dimensions dims;
-    Dimensions strides;
-    Scalar* data;
-    IndexType offset;
-  };
-
-  struct Src {
-    Src(const Dimensions& src_strides, const Scalar* src,
-        IndexType src_offset = 0)
-        : strides(src_strides), data(src), offset(src_offset) {}
-
-    Dimensions strides;
-    const Scalar* data;
-    IndexType offset;
-  };
-
-  // Copies data to `dst` from `src`, using provided dimensions mapping:
-  //
-  //   src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
-  //
-  // Returns the number of copied elements.
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
-      const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
-    // Copy single scalar value from `src` to `dst`.
-    if (NumDims == 0) {
-      *(dst.data + dst.offset) = *(src.data + src.offset);
-      return 1;
-    }
-
-    // Both `dst` and `src` must have contiguous innermost dimension. We also
-    // accept the special case with stride '0', because it's used as a trick to
-    // implement broadcasting.
-    {
-      int inner_dim = IsColMajor ? 0 : NumDims - 1;
-      EIGEN_UNUSED_VARIABLE(inner_dim);
-      eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
-      eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
-    }
-
-    // Give a shorter name to `dst_to_src_dim_map`.
-    const DimensionsMap& dim_map = dst_to_src_dim_map;
-
-    // Do not squeeze reordered inner dimensions.
-    int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
-
-    // NOTE: We find the innermost dimension (contiguous in memory) in the dst
-    // block, and we write data linearly into that dimension, reading it from
-    // the src. If dimensions are reordered, we might end up reading data from
-    // the src with `stride != 1`.
-    //
-    // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
-    // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
-
-    // Find the innermost dimension in the dst whose size is not 1. This is the
-    // effective inner dim.
-    int num_size_one_inner_dims = 0;
-    for (int i = 0; i < num_squeezable_dims; ++i) {
-      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
-      if (dst.dims[dst_dim] != 1) break;
-      num_size_one_inner_dims++;
-    }
-
-    // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
-    if (num_size_one_inner_dims == NumDims) {
-      *(dst.data + dst.offset) = *(src.data + src.offset);
-      return 1;
-    }
-
-    // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
-    const int dst_stride1_dim = IsColMajor
-                                    ? num_size_one_inner_dims
-                                    : NumDims - num_size_one_inner_dims - 1;
-
-    // Dimension in the src that corresponds to the dst innermost dimension.
-    const int src_dim_for_dst_stride1_dim =
-        NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
-
-    // Size of the innermost dimension (length of contiguous blocks of memory).
-    IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
-
-    // Squeeze multiple inner dims into one if they are contiguous in `dst` and
-    // `src` memory, so we can do fewer linear copy calls.
-    for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
-      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
-      const IndexType dst_stride = dst.strides[dst_dim];
-      const IndexType src_stride = src.strides[dim_map[dst_dim]];
-      if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
-        dst_inner_dim_size *= dst.dims[dst_dim];
-        ++num_size_one_inner_dims;
-      } else {
-        break;
-      }
-    }
-
-    // Setup strides to read data from `src` and write to `dst`.
-    IndexType input_offset = src.offset;
-    IndexType output_offset = dst.offset;
-    IndexType input_stride =
-        NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
-    IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
-
-    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
-    array<BlockIteratorState, at_least_1_dim> it;
-
-    // Initialize block iterator state. Squeeze away any dimension of size 1.
-    int idx = 0;  // currently initialized iterator state index
-    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
-      const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
-      if (dst.dims[dst_dim] == 1) continue;
-
-      it[idx].size = dst.dims[dst_dim];
-      it[idx].input_stride = src.strides[dim_map[dst_dim]];
-      it[idx].output_stride = dst.strides[dst_dim];
-
-      it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
-      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
-
-      idx++;
-    }
-
-    // Iterate copying data from src to dst.
-    const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
-
-#define COPY_INNER_DIM(KIND)                                           \
-  IndexType num_copied = 0;                                            \
-  for (num_copied = 0; num_copied < block_total_size;                  \
-       num_copied += dst_inner_dim_size) {                             \
-    LinCopy::template Run<KIND>(                                       \
-        typename LinCopy::Dst(output_offset, output_stride, dst.data), \
-        typename LinCopy::Src(input_offset, input_stride, src.data),   \
-        dst_inner_dim_size);                                           \
-                                                                       \
-    for (int j = 0; j < idx; ++j) {                                    \
-      if (++it[j].count < it[j].size) {                                \
-        input_offset += it[j].input_stride;                            \
-        output_offset += it[j].output_stride;                          \
-        break;                                                         \
-      }                                                                \
-      it[j].count = 0;                                                 \
-      input_offset -= it[j].input_span;                                \
-      output_offset -= it[j].output_span;                              \
-    }                                                                  \
-  }                                                                    \
-  return num_copied;
-
-    if (input_stride == 1 && output_stride == 1) {
-      COPY_INNER_DIM(LinCopy::Linear);
-    } else if (input_stride == 1 && output_stride != 1) {
-      COPY_INNER_DIM(LinCopy::Scatter);
-    } else if (input_stride == 0 && output_stride == 1) {
-      COPY_INNER_DIM(LinCopy::FillLinear);
-    } else if (input_stride == 0 && output_stride != 1) {
-      COPY_INNER_DIM(LinCopy::FillScatter);
-    } else if (output_stride == 1) {
-      COPY_INNER_DIM(LinCopy::Gather);
-    } else {
-      COPY_INNER_DIM(LinCopy::Random);
-    }
-
-#undef COPY_INNER_DIM
-  }
-
-  // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
-  // the number of copied elements.
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
-                                                              const Src& src) {
-    DimensionsMap dst_to_src_map;
-    for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
-    return Copy(dst, src, dst_to_src_map);
-  }
-
- private:
-  struct BlockIteratorState {
-    BlockIteratorState()
-        : size(0),
-          count(0),
-          input_stride(0),
-          output_stride(0),
-          input_span(0),
-          output_span(0) {}
-
-    IndexType size;
-    IndexType count;
-    IndexType input_stride;
-    IndexType output_stride;
-    IndexType input_span;
-    IndexType output_span;
-  };
-
-  // Compute how many inner dimensions it's allowed to squeeze when doing IO
-  // between two tensor blocks. It's safe to squeeze inner dimensions, only
-  // if they are not reordered.
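// For illustration (hypothetical): an identity map [0, 1, 2] allows all 3
// inner dims to be squeezed, while [0, 2, 1] (ColMajor) stops after dim 0,
// because dim 1 is reordered.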
-  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
-    int num_squeezable_dims = 0;
-    for (int i = 0; i < NumDims; ++i) {
-      const int dim = IsColMajor ? i : NumDims - i - 1;
-      if (dim_map[dim] != dim) break;
-      num_squeezable_dims++;
-    }
-    return num_squeezable_dims;
-  }
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
-// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
-//
-// Currently there is no way to write from a Tensor expression to a block of
-// memory, if dimensions are reordered. If you need to do that, you should
-// materialize a Tensor block expression into a memory buffer, and then use
-// TensorBlockIO to copy data between two memory buffers with a custom
-// `target->src` dimension map (see definition above).
-//
-// Also currently the innermost dimension of `target` must have a stride '1'
-// (contiguous in memory). This restriction could be lifted with a `pscatter`,
-// but in practice it's never needed, and there is a similar TensorBlockIO
-// workaround for that.
-//
-// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
-// where `src` is a tensor expression. Explore if it is possible to rewrite IO
-// to use expressions instead of pointers, and after that TensorBlockAssignment
-// will become an alias to IO.
-template <typename Scalar, int NumDims, typename TensorBlockExpr,
-          typename IndexType = Eigen::Index>
-class TensorBlockAssignment {
-  // We will use coeff/packet path to evaluate block expressions.
-  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
-      TensorBlockEvaluator;
-
-  typedef DSizes<IndexType, NumDims> Dimensions;
-
-  enum {
-    Vectorizable = packet_traits<Scalar>::Vectorizable,
-    PacketSize = packet_traits<Scalar>::size
-  };
-
-  template <bool Vectorizable, typename Evaluator>
-  struct InnerDimAssign {
-    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
-                                        const Evaluator& eval,
-                                        IndexType eval_offset) {
-      for (IndexType i = 0; i < count; ++i) {
-        target[i] = eval.coeff(eval_offset + i);
-      }
-    }
-  };
-
-  template <typename Evaluator>
-  struct InnerDimAssign<true, Evaluator> {
-    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
-                                        const Evaluator& eval,
-                                        IndexType eval_offset) {
-      typedef typename packet_traits<Scalar>::type Packet;
-
-      const IndexType unrolled_size = count - 4 * PacketSize;
-      const IndexType vectorized_size = count - PacketSize;
-      IndexType i = 0;
-
-      for (; i <= unrolled_size; i += 4 * PacketSize) {
-        for (int j = 0; j < 4; ++j) {
-          const IndexType idx = eval_offset + i + j * PacketSize;
-          Packet p = eval.template packet<Unaligned>(idx);
-          pstoreu<Scalar>(target + i + j * PacketSize, p);
-        }
-      }
-
-      for (; i <= vectorized_size; i += PacketSize) {
-        Packet p = eval.template packet<Unaligned>(eval_offset + i);
-        pstoreu<Scalar>(target + i, p);
-      }
-
-      for (; i < count; ++i) {
-        target[i] = eval.coeff(eval_offset + i);
-      }
-    }
-  };
-
- public:
-  struct Target {
-    Target(const Dimensions& target_dims, const Dimensions& target_strides,
-           Scalar* target_data, IndexType target_offset = 0)
-        : dims(target_dims),
-          strides(target_strides),
-          data(target_data),
-          offset(target_offset) {}
-
-    Dimensions dims;
-    Dimensions strides;
-    Scalar* data;
-    IndexType offset;
-  };
-
-  static Target target(const Dimensions& target_dims,
-                       const Dimensions& target_strides, Scalar* target_data,
-                       IndexType target_offset = 0) {
-    return Target(target_dims, target_strides, target_data, target_offset);
-  }
-
-  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
-  static Target target(
-      const DSizes<TargetDimsIndexType, NumDims>& target_dims,
-      const DSizes<TargetStridesIndexType, NumDims>& target_strides,
-      Scalar* target_data, IndexType target_offset = 0) {
-    // DSizes constructor will do index type promotion if it's safe.
-    return Target(Dimensions(target_dims), Dimensions(target_strides),
-                  target_data, target_offset);
-  }
-
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const Target& target, const TensorBlockExpr& expr) {
-    // Prepare evaluator for block expression.
-    DefaultDevice default_device;
-    TensorBlockEvaluator eval(expr, default_device);
-
-    // Tensor block expression dimensions should match destination dimensions.
-    eigen_assert(dimensions_match(target.dims, eval.dimensions()));
-
-    static const int Layout = TensorBlockEvaluator::Layout;
-    static const bool is_col_major = Layout == ColMajor;
-
-    // Initialize output inner dimension size based on a layout.
-    const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
-    const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
-    IndexType output_inner_dim_size = target.dims[inner_dim_idx];
-
-    // Target inner dimension stride must be '1'.
-    eigen_assert(target.strides[inner_dim_idx] == 1);
-
-    // Squeeze multiple inner dims into one if they are contiguous in `target`.
-    IndexType num_squeezed_dims = 0;
-    for (Index i = 1; i < NumDims; ++i) {
-      const Index dim = is_col_major ? i : NumDims - i - 1;
-      const IndexType target_stride = target.strides[dim];
-
-      if (output_inner_dim_size == target_stride) {
-        output_inner_dim_size *= target.dims[dim];
-        num_squeezed_dims++;
-      } else {
-        break;
-      }
-    }
-
-    // Initialize output block iterator state. Dimensions in this array are
-    // always in inner_most -> outer_most order (col major layout).
-    array<BlockIteratorState, NumDims> it;
-
-    int idx = 0;  // currently initialized iterator state index
-    for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
-      const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
-
-      it[idx].count = 0;
-      it[idx].size = target.dims[dim];
-      it[idx].output_stride = target.strides[dim];
-      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
-      idx++;
-    }
-
-    // We read block expression from the beginning, and start writing data to
-    // `target` at given offset.
-    IndexType input_offset = 0;
-    IndexType output_offset = target.offset;
-
-    // Iterate copying data from `eval` to `target`.
-    for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
-      // Assign to `target` at current offset.
-      InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
-                     TensorBlockEvaluator>::Run(target.data + output_offset,
-                                                output_inner_dim_size, eval,
-                                                input_offset);
-
-      // Move input offset forward by the number of assigned coefficients.
-      input_offset += output_inner_dim_size;
-
-      // Update index.
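// (For illustration: the loop below works like an odometer over the
// non-squeezed dims; e.g. two hypothetical states of sizes [2, 3] visit
// counts (0,0), (1,0), (0,1), (1,1), (0,2), (1,2) and then wrap.)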
-      for (int j = 0; j < idx; ++j) {
-        if (++it[j].count < it[j].size) {
-          output_offset += it[j].output_stride;
-          break;
-        }
-        it[j].count = 0;
-        output_offset -= it[j].output_span;
-      }
-    }
-  }
-
- private:
-  struct BlockIteratorState {
-    BlockIteratorState()
-        : count(0), size(0), output_stride(0), output_span(0) {}
-
-    IndexType count;
-    IndexType size;
-    IndexType output_stride;
-    IndexType output_span;
-  };
-};
-
-// -------------------------------------------------------------------------- //
-
-}  // namespace internal
-}  // namespace Eigen
-
-#endif  // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 454b0f752..620c8741c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -114,7 +114,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   enum {
     IsAligned = true,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
     PreferBlockAccess = true,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     RawAccess = false
@@ -130,12 +130,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
   typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
 
-  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
       ArgTensorBlock;
 
   typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
                                                      Layout, Index>
-      TensorBlockV2;
+      TensorBlock;
   //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
@@ -617,19 +617,19 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockV2ResourceRequirements getResourceRequirements() const {
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
     // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
     // tensors. But this might need further tuning.
     const size_t target_block_size = numext::maxi<size_t>(
         1, m_device.firstLevelCacheSize() / sizeof(Scalar));
 
-    return internal::TensorBlockV2ResourceRequirements::merge(
-        {internal::TensorBlockV2ShapeType::kSkewedInnerDims, target_block_size},
+    return internal::TensorBlockResourceRequirements::merge(
+        {internal::TensorBlockShapeType::kSkewedInnerDims, target_block_size},
         m_impl.getResourceRequirements());
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
-  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
           bool /*root_of_expr_ast*/ = false) const {
     BlockBroadcastingParams params = blockBroadcastingParams(desc);
 
@@ -638,8 +638,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     }
 
     // Prepare storage for the materialized broadcasting result.
-    const typename TensorBlockV2::Storage block_storage =
-        TensorBlockV2::prepareStorage(desc, scratch);
+    const typename TensorBlock::Storage block_storage =
+        TensorBlock::prepareStorage(desc, scratch);
     ScalarNoConst* materialized_output = block_storage.data();
 
     // We potentially will need to materialize input blocks.
@@ -843,10 +843,10 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     return params;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 emptyBlock() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const {
     DSizes<Index, NumDims> dimensions;
     for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
-    return TensorBlockV2(internal::TensorBlockKind::kView, NULL, dimensions);
+    return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim(
@@ -856,7 +856,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       size_t* materialized_input_size) const {
     if (params.bcast_dim_size == 1) {
       // We just need one block read using the ready-set values above.
-      return BroadcastBlockV2(
+      return BroadcastBlock(
           params.input_block_sizes, params.input_block_strides,
           params.bcast_block_sizes, params.bcast_block_strides,
           params.bcast_input_strides, bcast_offset, 0, scratch,
@@ -873,7 +873,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       params.bcast_block_strides[broadcast_bcast_dim] =
           params.output_strides[params.bcast_dim];
 
-      return BroadcastBlockV2(
+      return BroadcastBlock(
           params.input_block_sizes, params.input_block_strides,
           params.bcast_block_sizes, params.bcast_block_strides,
           params.bcast_input_strides, bcast_offset, 0, scratch,
@@ -942,7 +942,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
             params.output_strides[params.bcast_dim] *
             params.input_dims[params.bcast_dim];
 
-        num_output_coeffs += BroadcastBlockV2(
+        num_output_coeffs += BroadcastBlock(
             params.input_block_sizes, params.input_block_strides,
             params.bcast_block_sizes, params.bcast_block_strides,
            params.bcast_input_strides, bcast_offset, 0, scratch,
@@ -964,7 +964,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
         const Index offset = (first_multiple - bcast_dim_left_index) *
                              m_outputStrides[params.bcast_dim];
 
-        num_output_coeffs += BroadcastBlockV2(
+        num_output_coeffs += BroadcastBlock(
            params.input_block_sizes, params.input_block_strides,
            params.bcast_block_sizes, params.bcast_block_strides,
            params.bcast_input_strides, bcast_offset, offset, scratch,
@@ -987,7 +987,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
         const Index offset = (last_multiple - bcast_dim_left_index) *
                              m_outputStrides[params.bcast_dim];
 
-        num_output_coeffs += BroadcastBlockV2(
+        num_output_coeffs += BroadcastBlock(
            params.input_block_sizes, params.input_block_strides,
            params.bcast_block_sizes, params.bcast_block_strides,
            params.bcast_input_strides, bcast_offset, offset, scratch,
@@ -1005,7 +1005,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       params.bcast_block_strides[copy_bcast_dim] =
           params.output_strides[params.bcast_dim];
 
-      num_output_coeffs += BroadcastBlockV2(
+      num_output_coeffs += BroadcastBlock(
          params.input_block_sizes, params.input_block_strides,
          params.bcast_block_sizes, params.bcast_block_strides,
          params.bcast_input_strides, bcast_offset, 0, scratch,
@@ -1016,7 +1016,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     }
  }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockV2(
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock(
       const Dimensions& input_block_sizes,
       const Dimensions& input_block_strides,
       const BroadcastDimensions& bcast_block_sizes,
@@ -1032,7 +1032,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
         IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset),
         input_block_sizes);
 
-    ArgTensorBlock input_block = m_impl.blockV2(input_desc, scratch);
+    ArgTensorBlock input_block = m_impl.block(input_desc, scratch);
 
     // ---------------------------------------------------------------------- //
     // Materialize input block into a temporary memory buffer only if it's not
@@ -1071,14 +1071,14 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     // ---------------------------------------------------------------------- //
     // Copy data from materialized input block to the materialized output, using
     // given broadcast strides (strides with zeroes).
-    typedef internal::TensorBlockIOV2<ScalarNoConst, Index, NumDims, Layout>
-        TensorBlockIOV2;
+    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
+        TensorBlockIO;
 
-    typename TensorBlockIOV2::Src src(bcast_input_strides, input_buffer);
-    typename TensorBlockIOV2::Dst dst(bcast_block_sizes, bcast_block_strides,
+    typename TensorBlockIO::Src src(bcast_input_strides, input_buffer);
+    typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides,
                                       materialized_output + offset);
 
-    return TensorBlockIOV2::Copy(dst, src);
+    return TensorBlockIO::Copy(dst, src);
   }
 
  protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 268c3246a..f51a8559d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -148,7 +148,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
   enum {
     IsAligned = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
 
     // Chipping of outer-most dimension is a trivial operation, because we can
    // read and write directly from the underlying tensor using single offset.
    IsOuterChipping = (static_cast<int>(Layout) == ColMajor && DimId == NumInputDims - 1) ||
                      (static_cast<int>(Layout) == RowMajor && DimId == 0),
@@ -172,12 +172,12 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
   typedef internal::TensorBlockDescriptor<NumInputDims, Index>
       ArgTensorBlockDesc;
 
-  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
       ArgTensorBlock;
 
   typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
                                                      Layout, Index>
-      TensorBlockV2;
+      TensorBlock;
   //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -295,17 +295,17 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockV2ResourceRequirements getResourceRequirements() const {
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
     const size_t target_block_size =
         numext::maxi<size_t>(1, m_device.lastLevelCacheSize() / sizeof(Scalar));
-    return internal::TensorBlockV2ResourceRequirements::merge(
-        {internal::TensorBlockV2ShapeType::kSkewedInnerDims, target_block_size},
+    return internal::TensorBlockResourceRequirements::merge(
+        {internal::TensorBlockShapeType::kSkewedInnerDims, target_block_size},
         m_impl.getResourceRequirements());
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
-  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
           bool root_of_expr_ast = false) const {
     const Index chip_dim = m_dim.actualDim();
@@ -334,20 +334,20 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
                                      arg_destination_strides);
     }
 
-    ArgTensorBlock arg_block = m_impl.blockV2(arg_desc, scratch, root_of_expr_ast);
+    ArgTensorBlock arg_block = m_impl.block(arg_desc, scratch, root_of_expr_ast);
     if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
 
     if (arg_block.data() != NULL) {
       // Forward argument block buffer if possible.
-      return TensorBlockV2(arg_block.kind(), arg_block.data(),
+      return TensorBlock(arg_block.kind(), arg_block.data(),
                            desc.dimensions());
 
     } else {
       // Assign argument block expression to a buffer.
 
       // Prepare storage for the materialized chipping result.
- const typename TensorBlockV2::Storage block_storage = - TensorBlockV2::prepareStorage(desc, scratch); + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); typedef internal::TensorBlockAssignment< ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index> @@ -442,7 +442,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = TensorEvaluator::RawAccess, + BlockAccess = TensorEvaluator::RawAccess, Layout = TensorEvaluator::Layout, RawAccess = false }; @@ -499,9 +499,9 @@ struct TensorEvaluator, Device> } } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( - const TensorBlockDesc& desc, const TensorBlockV2& block) { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { assert(this->m_impl.data() != NULL); const Index chip_dim = this->m_dim.actualDim(); @@ -514,7 +514,7 @@ struct TensorEvaluator, Device> } typedef TensorReshapingOp, - const typename TensorBlockV2::XprType> + const typename TensorBlock::XprType> TensorBlockExpr; typedef internal::TensorBlockAssignment::PacketAccess && TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess || TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, @@ -133,7 +133,7 @@ struct TensorEvaluator::PacketAccess && TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess || TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, @@ -332,7 +332,7 @@ template::size > 1), - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -389,7 +389,7 @@ struct TensorContractionEvaluatorBase }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// // Most of the code is assuming that both input tensors are ColMajor. 
If the diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index f9f90ec02..cdbafbbb1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -302,7 +302,7 @@ struct TensorEvaluator, Device> TensorEvaluator::PacketAccess & internal::type_casting_traits::VectorizedCast, #endif - BlockAccessV2 = TensorEvaluator::BlockAccessV2, + BlockAccess = TensorEvaluator::BlockAccess, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, RawAccess = false @@ -314,7 +314,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - typedef typename TensorEvaluator::TensorBlockV2 + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; struct TensorConversionOpBlockFactory { @@ -331,7 +331,7 @@ struct TensorEvaluator, Device> typedef internal::TensorUnaryExprBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -398,14 +398,14 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { + internal::TensorBlockResourceRequirements getResourceRequirements() const { return m_impl.getResourceRequirements(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { - return TensorBlockV2(m_impl.blockV2(desc, scratch), + return TensorBlock(m_impl.block(desc, scratch), TensorConversionOpBlockFactory()); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 44068fedc..27ad9f147 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -309,7 +309,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -317,7 +317,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -794,7 +794,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -302,7 +302,7 @@ struct TensorEvaluator, Devi enum { IsAligned = false, PacketAccess = (PacketType::size > 1), - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -103,7 +103,7 @@ struct TensorEvaluator, Devi }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef 
internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) @@ -268,7 +268,7 @@ struct TensorEvaluator::size > 1), - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -276,7 +276,7 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = true, + BlockAccess = true, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -123,7 +123,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - typedef typename TensorEvaluator::TensorBlockV2 + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; typedef internal::TensorBlockAssignment< @@ -165,11 +165,11 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { + internal::TensorBlockResourceRequirements getResourceRequirements() const { return m_impl.getResourceRequirements(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2( + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock( TensorBlockDesc& desc, TensorBlockScratch& scratch) { // Add `m_buffer` as destination buffer to the block descriptor. desc.template AddDestinationBuffer( @@ -177,7 +177,7 @@ struct TensorEvaluator, Device> /*dst_strides=*/internal::strides(m_impl.dimensions())); ArgTensorBlock block = - m_impl.blockV2(desc, scratch, /*root_of_expr_ast=*/true); + m_impl.block(desc, scratch, /*root_of_expr_ast=*/true); // If block was evaluated into a destination buffer, there is no need to do // an assignment. 
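
The EvalTo hunk above is the clearest picture of the destination-buffer protocol: the parent offers its output buffer through the descriptor, and the assignment is skipped when the child evaluated directly into it. A sketch of that handshake; the kMaterializedInOutput check is an assumption based on the "no need to do an assignment" comment, and the scalar type and layout are stand-ins.

    // Sketch of the destination-buffer handshake used by evalBlock() above.
    // `Evaluator` is any block-capable evaluator; ColMajor/float are assumed.
    template <typename Evaluator, typename BlockDesc, typename Scratch>
    void eval_block_into_sketch(Evaluator& impl, float* output,
                                BlockDesc& desc, Scratch& scratch) {
      desc.template AddDestinationBuffer<Eigen::ColMajor>(
          /*dst_base=*/output + desc.offset(),
          /*dst_strides=*/Eigen::internal::strides<Eigen::ColMajor>(
              impl.dimensions()));

      auto block = impl.block(desc, scratch, /*root_of_expr_ast=*/true);

      // Assumed kind check: assign only if the child did not already
      // materialize into `output` (the assignment itself is elided here;
      // the patch uses internal::TensorBlockAssignment for it).
      if (block.kind() !=
          Eigen::internal::TensorBlockKind::kMaterializedInOutput) {
        // internal::TensorBlockAssignment would run here.
      }
      block.cleanup();
    }
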
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 613a8347d..146cc325e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -45,7 +45,7 @@ struct TensorEvaluator enum { IsAligned = Derived::IsAligned, PacketAccess = (PacketType::size > 1), - BlockAccessV2 = internal::is_arithmetic::type>::value, + BlockAccess = internal::is_arithmetic::type>::value, PreferBlockAccess = false, Layout = Derived::Layout, CoordAccess = NumCoords > 0, @@ -60,7 +60,7 @@ struct TensorEvaluator typedef typename internal::TensorMaterializedBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) @@ -150,23 +150,23 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { - return internal::TensorBlockV2ResourceRequirements::any(); + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { assert(m_data != NULL); - return TensorBlockV2::materialize(m_data, m_dims, desc, scratch); + return TensorBlock::materialize(m_data, m_dims, desc, scratch); } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( - const TensorBlockDesc& desc, const TensorBlockV2& block) { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { assert(m_data != NULL); - typedef typename TensorBlockV2::XprType TensorBlockExpr; + typedef typename TensorBlock::XprType TensorBlockExpr; typedef internal::TensorBlockAssignment TensorBlockAssign; @@ -246,7 +246,7 @@ struct TensorEvaluator enum { IsAligned = Derived::IsAligned, PacketAccess = (PacketType::size > 1), - BlockAccessV2 = internal::is_arithmetic::value, + BlockAccess = internal::is_arithmetic::value, PreferBlockAccess = false, Layout = Derived::Layout, CoordAccess = NumCoords > 0, @@ -259,7 +259,7 @@ struct TensorEvaluator typedef typename internal::TensorMaterializedBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) @@ -323,15 +323,15 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { - return internal::TensorBlockV2ResourceRequirements::any(); + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { assert(m_data != NULL); - return TensorBlockV2::materialize(m_data, m_dims, desc, scratch); + return TensorBlock::materialize(m_data, m_dims, desc, scratch); } EIGEN_DEVICE_FUNC 
EvaluatorPointerType data() const { return m_data; } @@ -378,7 +378,7 @@ struct TensorEvaluator, Device> && (PacketType::size >1) #endif , - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -386,7 +386,7 @@ struct TensorEvaluator, Device> }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -448,7 +448,7 @@ struct TensorEvaluator, Device> IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, - BlockAccessV2 = TensorEvaluator::BlockAccessV2, + BlockAccess = TensorEvaluator::BlockAccess, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -476,11 +476,11 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - typedef typename TensorEvaluator::TensorBlockV2 + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; typedef internal::TensorCwiseUnaryBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -520,14 +520,14 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { + internal::TensorBlockResourceRequirements getResourceRequirements() const { return m_argImpl.getResourceRequirements(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { - return TensorBlockV2(m_argImpl.blockV2(desc, scratch), m_functor); + return TensorBlock(m_argImpl.block(desc, scratch), m_functor); } EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } @@ -560,8 +560,8 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, - BlockAccessV2 = TensorEvaluator::BlockAccessV2 & - TensorEvaluator::BlockAccessV2, + BlockAccess = TensorEvaluator::BlockAccess & + TensorEvaluator::BlockAccess, PreferBlockAccess = TensorEvaluator::PreferBlockAccess | TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, @@ -595,14 +595,14 @@ struct TensorEvaluator TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - typedef typename TensorEvaluator::TensorBlockV2 + typedef typename TensorEvaluator::TensorBlock LeftTensorBlock; - typedef typename TensorEvaluator::TensorBlockV2 + typedef typename TensorEvaluator::TensorBlock RightTensorBlock; typedef internal::TensorCwiseBinaryBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -653,18 +653,18 @@ struct TensorEvaluator::PacketAccess && TensorEvaluator::PacketAccess && internal::functor_traits::PacketAccess, - 
BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess || TensorEvaluator::PreferBlockAccess || TensorEvaluator::PreferBlockAccess, @@ -739,7 +739,7 @@ struct TensorEvaluator PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & PacketType::HasBlend, - BlockAccessV2 = TensorEvaluator::BlockAccessV2 && - TensorEvaluator::BlockAccessV2 && - TensorEvaluator::BlockAccessV2, + BlockAccess = TensorEvaluator::BlockAccess && + TensorEvaluator::BlockAccess && + TensorEvaluator::BlockAccess, PreferBlockAccess = TensorEvaluator::PreferBlockAccess || TensorEvaluator::PreferBlockAccess || TensorEvaluator::PreferBlockAccess, @@ -850,11 +850,11 @@ struct TensorEvaluator typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - typedef typename TensorEvaluator::TensorBlockV2 + typedef typename TensorEvaluator::TensorBlock IfArgTensorBlock; - typedef typename TensorEvaluator::TensorBlockV2 + typedef typename TensorEvaluator::TensorBlock ThenArgTensorBlock; - typedef typename TensorEvaluator::TensorBlockV2 + typedef typename TensorEvaluator::TensorBlock ElseArgTensorBlock; struct TensorSelectOpBlockFactory { @@ -873,7 +873,7 @@ struct TensorEvaluator typedef internal::TensorTernaryExprBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -933,24 +933,24 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { - return internal::TensorBlockV2ResourceRequirements::merge( + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::merge( m_condImpl.getResourceRequirements(), - internal::TensorBlockV2ResourceRequirements::merge( + internal::TensorBlockResourceRequirements::merge( m_thenImpl.getResourceRequirements(), m_elseImpl.getResourceRequirements())); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { // It's unsafe to pass destination buffer to underlying expressions, because // output might be aliased with one of the inputs. desc.DropDestinationBuffer(); - return TensorBlockV2( - m_condImpl.blockV2(desc, scratch), m_thenImpl.blockV2(desc, scratch), - m_elseImpl.blockV2(desc, scratch), TensorSelectOpBlockFactory()); + return TensorBlock( + m_condImpl.block(desc, scratch), m_thenImpl.block(desc, scratch), + m_elseImpl.block(desc, scratch), TensorSelectOpBlockFactory()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 7b7b670ed..b2327da1e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -172,7 +172,7 @@ class TensorExecutor + typedef TensorBlockMapper TensorBlockMapper; typedef internal::TensorBlockDescriptor @@ -187,7 +187,7 @@ class TensorExecutor GetTensorExecutorTilingContext( const ThreadPoolDevice& device, const Evaluator& evaluator, bool allocate_buffer = true) { // Query expression tree for desired block size/shape. 
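
The TensorExecutor hunks around this point tie the renamed pieces together: query the expression's TensorBlockResourceRequirements, build a TensorBlockMapper, then evaluate block by block. A single-threaded sketch of that loop over a root assignment evaluator; the rank and layout are fixed arbitrarily.

    // Sketch of the tiled evaluation loop that TensorExecutor drives below.
    // `evaluator` must expose evalBlock(); rank 2 / ColMajor are assumed.
    template <typename Evaluator, typename Device>
    void tiled_eval_sketch(Evaluator& evaluator, const Device& device) {
      using namespace Eigen;

      const internal::TensorBlockResourceRequirements requirements =
          evaluator.getResourceRequirements();

      internal::TensorBlockMapper<2, ColMajor> block_mapper(
          evaluator.dimensions(), requirements);

      internal::TensorBlockScratchAllocator<Device> scratch(device);
      for (Index i = 0; i < block_mapper.blockCount(); ++i) {
        internal::TensorBlockDescriptor<2> desc =
            block_mapper.blockDescriptor(i);
        evaluator.evalBlock(desc, scratch);
        scratch.reset();  // reuse scratch memory for the next block
      }
    }
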
- const TensorBlockV2ResourceRequirements requirements = + const TensorBlockResourceRequirements requirements = evaluator.getResourceRequirements(); int num_threads = device.numThreads(); @@ -377,7 +377,7 @@ class TensorExecutor::NumDimensions; typedef TensorEvaluator Evaluator; - typedef TensorBlockV2Mapper BlockMapper; + typedef TensorBlockMapper BlockMapper; typedef TensorExecutorTilingContext TilingContext; typedef internal::TensorBlockDescriptor @@ -402,7 +402,7 @@ class TensorExecutor::NumDimensions; typedef TensorEvaluator Evaluator; - typedef TensorBlockV2Mapper BlockMapper; + typedef TensorBlockMapper BlockMapper; typedef TensorExecutorTilingContext TilingContext; typedef internal::TensorBlockDescriptor TensorBlockDesc; @@ -510,7 +510,7 @@ class TensorAsyncExecutortiling.block_mapper.blockDescriptor(block_idx); - ctx->evaluator.evalBlockV2(desc, scratch); + ctx->evaluator.evalBlock(desc, scratch); scratch.reset(); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index a8841bc38..c62bc5fa9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -133,7 +133,7 @@ struct TensorEvaluator, D enum { IsAligned = false, PacketAccess = true, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -141,7 +141,7 @@ struct TensorEvaluator, D }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index ea3ea2c91..a5be54bcd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -41,7 +41,7 @@ class TensorFixedSize : public TensorBase0), PacketAccess = (internal::packet_traits::size > 1), - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = Options_ & RowMajor ? 
RowMajor : ColMajor, CoordAccess = true, @@ -49,7 +49,7 @@ class TensorFixedSize : public TensorBase, Device> enum { IsAligned = true, PacketAccess = (PacketType::size > 1), - BlockAccessV2 = internal::is_arithmetic::value, + BlockAccess = internal::is_arithmetic::value, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = true @@ -110,7 +110,7 @@ struct TensorEvaluator, Device> typedef typename internal::TensorMaterializedBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -177,15 +177,15 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { - return internal::TensorBlockV2ResourceRequirements::any(); + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { assert(m_buffer != NULL); - return TensorBlockV2::materialize(m_buffer, m_impl.dimensions(), desc, scratch); + return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index b115e502b..246ebe44e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -157,7 +157,7 @@ struct IsVectorizable { // Tiled evaluation strategy. enum TiledEvaluation { Off = 0, // tiled evaluation is not supported - On = 1, // still work in progress (see TensorBlockV2.h) + On = 1, // still work in progress (see TensorBlock.h) }; template @@ -165,12 +165,12 @@ struct IsTileable { // Check that block evaluation is supported and it's a preferred option (at // least one sub-expression has much faster block evaluation, e.g. // broadcasting). - static const bool BlockAccessV2 = - TensorEvaluator::BlockAccessV2 && + static const bool BlockAccess = + TensorEvaluator::BlockAccess && TensorEvaluator::PreferBlockAccess; static const TiledEvaluation value = - BlockAccessV2 ? TiledEvaluation::On : TiledEvaluation::Off; + BlockAccess ? 
TiledEvaluation::On : TiledEvaluation::Off; }; template , Device> enum { IsAligned = false, PacketAccess = (PacketType::size > 1), - BlockAccessV2 = true, + BlockAccess = true, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -108,7 +108,7 @@ struct TensorEvaluator, Device> typedef typename internal::TensorMaterializedBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -165,10 +165,10 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { + internal::TensorBlockResourceRequirements getResourceRequirements() const { const size_t target_block_size = numext::maxi( 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); - return {internal::TensorBlockV2ShapeType::kSkewedInnerDims, + return {internal::TensorBlockShapeType::kSkewedInnerDims, target_block_size}; } @@ -179,8 +179,8 @@ struct TensorEvaluator, Device> Index count; }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { static const bool is_col_major = static_cast(Layout) == static_cast(ColMajor); @@ -206,8 +206,8 @@ struct TensorEvaluator, Device> eigen_assert(it[0].stride == 1); // Prepare storage for the materialized generator result. - const typename TensorBlockV2::Storage block_storage = - TensorBlockV2::prepareStorage(desc, scratch); + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); CoeffReturnType* block_buffer = block_storage.data(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 5010d5c95..49d1004f3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -231,7 +231,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -239,7 +239,7 @@ struct TensorEvaluator, Device> }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index ef6b62620..7dadec7fb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -92,7 +92,7 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -100,7 +100,7 @@ struct TensorEvaluator, Device> }; //===- Tensor block evaluation strategy (see TensorBlock.h) 
-------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index 695726e10..05fa80e59 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -119,7 +119,7 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false, // to be implemented @@ -127,7 +127,7 @@ struct TensorEvaluator, Device> }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -198,14 +198,14 @@ template enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false // to be implemented }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 7697add4b..5c2036626 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -138,7 +138,7 @@ struct TensorEvaluator, Device> // For trivial reshapes with raw access to underlying data we will provide // zero overhead block access. // TODO(ezhulenev): Consider adding block access without raw access? 
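
Most hunks in this patch are variations of one traits edit: BlockAccessV2 becomes BlockAccess, and evaluators without a block implementation export the TensorBlockNotImplemented tag. A condensed, hypothetical evaluator showing the post-rename shape of that pattern:

    #include <unsupported/Eigen/CXX11/Tensor>

    // Hypothetical evaluator illustrating the renamed traits. Members that
    // do not matter for the rename are omitted.
    struct SomeEvaluatorSketch {
      enum {
        IsAligned         = false,
        PacketAccess      = false,
        BlockAccess       = false,  // was BlockAccessV2
        PreferBlockAccess = false,
        RawAccess         = false
      };

      //===- Tensor block evaluation strategy (see TensorBlock.h) --------===//
      typedef Eigen::internal::TensorBlockNotImplemented TensorBlock;
      //===----------------------------------------------------------------===//
    };
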
- BlockAccessV2 = TensorEvaluator::RawAccess && + BlockAccess = TensorEvaluator::RawAccess && NumInputDims > 0 && NumOutputDims > 0, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, @@ -155,7 +155,7 @@ struct TensorEvaluator, Device> typedef typename internal::TensorMaterializedBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -199,8 +199,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { - return internal::TensorBlockV2ResourceRequirements::any(); + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); } // required in block(OutputTensorBlock* output_block) const @@ -212,8 +212,8 @@ struct TensorEvaluator, Device> Index count; }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { eigen_assert(m_impl.data() != NULL); eigen_assert((kind == Runtime) || @@ -223,12 +223,12 @@ struct TensorEvaluator, Device> if (kind == OneByN || kind == NByOne) { // We can guarantee at compile time that block is just a contiguous slice // of the underlying expression memory buffer. - return TensorBlockV2(internal::TensorBlockKind::kView, + return TensorBlock(internal::TensorBlockKind::kView, m_impl.data() + desc.offset(), desc.dimensions()); } else { // This will do additional runtime checks, and in the end it might be also // a view, or it might be a block materialized in the temporary buffer. - return TensorBlockV2::materialize(m_impl.data(), m_dimensions, desc, + return TensorBlock::materialize(m_impl.data(), m_dimensions, desc, scratch); } } @@ -264,7 +264,7 @@ template enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = TensorEvaluator::RawAccess, + BlockAccess = TensorEvaluator::RawAccess, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -297,7 +297,7 @@ template } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( const TensorBlockDesc& desc, const TensorBlock& block) { assert(this->m_impl.data() != NULL); @@ -456,7 +456,7 @@ struct TensorEvaluator, Devi // slice offsets and sizes. IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = TensorEvaluator::BlockAccessV2, + BlockAccess = TensorEvaluator::BlockAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -470,8 +470,8 @@ struct TensorEvaluator, Devi typedef internal::TensorBlockScratchAllocator TensorBlockScratch; // Tensor slicing does not change the block type. - typedef typename TensorEvaluator::TensorBlockV2 - TensorBlockV2; + typedef typename TensorEvaluator::TensorBlock + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -547,7 +547,7 @@ struct TensorEvaluator, Devi } } // Use memcpy if it's going to be faster than using the regular evaluation. 
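
The slicing evaluator below merges its own preference for skewed inner dimensions with whatever its argument expression requires. A standalone sketch of that idiom; the float scalar and the generic device/child parameters are stand-ins.

    // Sketch of the requirement-merging idiom used by the renamed
    // getResourceRequirements() implementations in this patch.
    template <typename Device, typename ChildEvaluator>
    Eigen::internal::TensorBlockResourceRequirements requirements_sketch(
        const Device& device, const ChildEvaluator& child) {
      const size_t target_block_size = Eigen::numext::maxi<size_t>(
          1, device.lastLevelCacheSize() / sizeof(float));

      return Eigen::internal::TensorBlockResourceRequirements::merge(
          {Eigen::internal::TensorBlockShapeType::kSkewedInnerDims,
           target_block_size},
          child.getResourceRequirements());
    }
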
- const MemcpyTriggerForSlicing trigger(m_device); + const MemcpyTriggerForSlicing trigger(m_device); if (trigger(internal::array_prod(dimensions()), contiguous_values)) { EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data(); for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { @@ -633,19 +633,19 @@ struct TensorEvaluator, Devi } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { + internal::TensorBlockResourceRequirements getResourceRequirements() const { const size_t target_block_size = numext::maxi(1, m_device.lastLevelCacheSize() / sizeof(Scalar)); - return internal::TensorBlockV2ResourceRequirements::merge( - {internal::TensorBlockV2ShapeType::kSkewedInnerDims, target_block_size}, + return internal::TensorBlockResourceRequirements::merge( + {internal::TensorBlockShapeType::kSkewedInnerDims, target_block_size}, m_impl.getResourceRequirements()); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset())); - TensorBlockV2 block = m_impl.blockV2(arg_desc, scratch); + TensorBlock block = m_impl.block(arg_desc, scratch); if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); return block; } @@ -745,7 +745,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = TensorEvaluator::BlockAccessV2, + BlockAccess = TensorEvaluator::BlockAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -823,11 +823,11 @@ struct TensorEvaluator, Device> } } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( - const TensorBlockDesc& desc, const TensorBlockV2& block) { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset())); - this->m_impl.writeBlockV2(arg_desc, block); + this->m_impl.writeBlock(arg_desc, block); } }; @@ -935,14 +935,14 @@ struct TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, RawAccess = false }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -1116,7 +1116,7 @@ struct TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = TensorEvaluator::CoordAccess, @@ -1124,7 +1124,7 @@ struct TensorEvaluator, Device enum { IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = TensorEvaluator::RawAccess, + BlockAccess = TensorEvaluator::RawAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = true, @@ -113,7 +113,7 @@ struct TensorEvaluator, Device typedef typename internal::TensorMaterializedBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -228,20 +228,20 
@@ struct TensorEvaluator, Device } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { + internal::TensorBlockResourceRequirements getResourceRequirements() const { const size_t target_block_size = numext::maxi(1, m_device.lastLevelCacheSize() / sizeof(Scalar)); - return internal::TensorBlockV2ResourceRequirements::merge( - {internal::TensorBlockV2ShapeType::kSkewedInnerDims, target_block_size}, + return internal::TensorBlockResourceRequirements::merge( + {internal::TensorBlockShapeType::kSkewedInnerDims, target_block_size}, m_impl.getResourceRequirements()); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { // If one of the dimensions is zero, return empty block view. if (desc.size() == 0) { - return TensorBlockV2(internal::TensorBlockKind::kView, NULL, + return TensorBlock(internal::TensorBlockKind::kView, NULL, desc.dimensions()); } @@ -355,8 +355,8 @@ struct TensorEvaluator, Device typedef internal::StridedLinearBufferCopy LinCopy; // Prepare storage for the materialized padding result. - const typename TensorBlockV2::Storage block_storage = - TensorBlockV2::prepareStorage(desc, scratch); + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); // Iterate copying data from `m_impl.data()` to the output buffer. for (Index size = 0; size < output_size; size += output_inner_dim_size) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 4abe58ecd..64a436e50 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -96,7 +96,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -104,7 +104,7 @@ struct TensorEvaluator, Device> }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index c600c319d..ad14d82f8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -584,7 +584,7 @@ struct TensorReductionEvaluatorBase::Layout, CoordAccess = false, // to be implemented @@ -594,7 +594,7 @@ struct TensorReductionEvaluatorBase::type ScalarNoConst; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index 
ff5bfad46..4c6420586 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -141,7 +141,7 @@ template class TensorRef : public TensorBase class TensorRef : public TensorBase, Device> enum { IsAligned = false, PacketAccess = false, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorRef::Layout, CoordAccess = false, // to be implemented @@ -385,7 +385,7 @@ struct TensorEvaluator, Device> }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef& m, const Device&) @@ -430,13 +430,13 @@ struct TensorEvaluator, Device> : public TensorEvaluator& m, const Device& d) : Base(m, d) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 68699351b..c4ac81db8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -115,7 +115,7 @@ struct TensorEvaluator, Device enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = NumDims > 0, + BlockAccess = NumDims > 0, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -128,12 +128,12 @@ struct TensorEvaluator, Device typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - typedef typename TensorEvaluator::TensorBlockV2 + typedef typename TensorEvaluator::TensorBlock ArgTensorBlock; typedef typename internal::TensorMaterializedBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, @@ -245,15 +245,15 @@ struct TensorEvaluator, Device } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { + internal::TensorBlockResourceRequirements getResourceRequirements() const { const size_t target_block_size = numext::maxi(1, m_device.lastLevelCacheSize() / sizeof(Scalar)); - return {internal::TensorBlockV2ShapeType::kSkewedInnerDims, + return {internal::TensorBlockShapeType::kSkewedInnerDims, target_block_size}; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { // TODO(ezhulenev): If underlying tensor expression supports and prefers // block evaluation we must use it. Currently we use coeff and packet @@ -322,8 +322,8 @@ struct TensorEvaluator, Device const Index inner_dim_size = it[effective_inner_dim].size; // Prepare storage for the materialized reverse result. 
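
On the producer side, evaluators that build their result per block reserve storage through prepareStorage() and hand the filled buffer back as a materialized block, as the reverse evaluator does next. A sketch under assumed types; the AsTensorMaterializedBlock() finalizer is an assumption, as it is not shown in this part of the patch.

    // Sketch of the producer-side materialization pattern. float/rank-2/
    // ColMajor and the AsTensorMaterializedBlock() finalizer are assumed.
    typedef Eigen::internal::TensorMaterializedBlock<float, 2, Eigen::ColMajor>
        MaterializedBlock;

    template <typename BlockDesc, typename Scratch>
    MaterializedBlock materialize_sketch(BlockDesc& desc, Scratch& scratch) {
      const typename MaterializedBlock::Storage block_storage =
          MaterializedBlock::prepareStorage(desc, scratch);

      float* block_buffer = block_storage.data();
      for (Eigen::Index i = 0; i < desc.size(); ++i) {
        block_buffer[i] = 0.0f;  // stand-in for the real per-coefficient work
      }
      return block_storage.AsTensorMaterializedBlock();
    }
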
- const typename TensorBlockV2::Storage block_storage = - TensorBlockV2::prepareStorage(desc, scratch); + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); CoeffReturnType* block_buffer = block_storage.data(); while (it[NumDims - 1].count < it[NumDims - 1].size) { @@ -433,7 +433,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -449,7 +449,7 @@ struct TensorEvaluator, Device> static const int PacketSize = PacketType::size; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index d8005d604..ee465dd0f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -99,7 +99,7 @@ struct TensorEvaluator, Device> { enum { IsAligned = false, PacketAccess = (PacketType::size > 1), - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -107,7 +107,7 @@ struct TensorEvaluator, Device> { }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 42bca8172..1a6891ffd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -115,7 +115,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = (PacketType::size > 1), - BlockAccessV2 = TensorEvaluator::RawAccess, + BlockAccess = TensorEvaluator::RawAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -130,7 +130,7 @@ struct TensorEvaluator, Device> typedef typename internal::TensorMaterializedBlock - TensorBlockV2; + TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, @@ -245,7 +245,7 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockV2ResourceRequirements getResourceRequirements() const { + internal::TensorBlockResourceRequirements getResourceRequirements() const { static const int inner_dim = Layout == static_cast(ColMajor) ? 0 : NumDims - 1; @@ -254,23 +254,23 @@ struct TensorEvaluator, Device> const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim; return {inner_dim_shuffled - ? internal::TensorBlockV2ShapeType::kUniformAllDims - : internal::TensorBlockV2ShapeType::kSkewedInnerDims, + ? 
internal::TensorBlockShapeType::kUniformAllDims + : internal::TensorBlockShapeType::kSkewedInnerDims, target_block_size}; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool root_of_expr_ast = false) const { assert(m_impl.data() != NULL); - typedef internal::TensorBlockIOV2 + typedef internal::TensorBlockIO TensorBlockIO; typedef typename TensorBlockIO::Dst TensorBlockIODst; typedef typename TensorBlockIO::Src TensorBlockIOSrc; - const typename TensorBlockV2::Storage block_storage = - TensorBlockV2::prepareStorage( + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage( desc, scratch, /*allow_strided_storage=*/root_of_expr_ast); typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides); @@ -380,7 +380,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = (PacketType::size > 1), - BlockAccessV2 = TensorEvaluator::RawAccess, + BlockAccess = TensorEvaluator::RawAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, RawAccess = false @@ -414,12 +414,12 @@ struct TensorEvaluator, Device> } } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( - const TensorBlockDesc& desc, const TensorBlockV2& block) { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { eigen_assert(this->m_impl.data() != NULL); - typedef internal::TensorBlockIOV2 + typedef internal::TensorBlockIO TensorBlockIO; typedef typename TensorBlockIO::Dst TensorBlockIODst; typedef typename TensorBlockIO::Src TensorBlockIOSrc; @@ -434,7 +434,7 @@ struct TensorEvaluator, Device> ScalarNoConst* buf = static_cast(mem); typedef internal::TensorBlockAssignment< - ScalarNoConst, NumDims, typename TensorBlockV2::XprType, Index> + ScalarNoConst, NumDims, typename TensorBlock::XprType, Index> TensorBlockAssignment; TensorBlockAssignment::Run( diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 8a7fcac23..d05f37532 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -114,7 +114,7 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -122,7 +122,7 @@ struct TensorEvaluator, Device> }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h index 209d6fb3b..24d22c189 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -97,7 +97,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = 
TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -105,7 +105,7 @@ struct TensorEvaluator, Device> }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index a4c38f118..000ed5b41 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -183,7 +183,7 @@ struct TensorEvaluator, D enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccessV2 = false, + BlockAccess = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -191,7 +191,7 @@ struct TensorEvaluator, D }; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp index b56601ebd..33dc2535a 100644 --- a/unsupported/test/cxx11_tensor_block_access.cpp +++ b/unsupported/test/cxx11_tensor_block_access.cpp @@ -19,7 +19,7 @@ using Eigen::Tensor; using Eigen::Index; using Eigen::RowMajor; using Eigen::ColMajor; -using Eigen::internal::TensorBlockV2ShapeType; +using Eigen::internal::TensorBlockShapeType; template @@ -27,10 +27,10 @@ static const T& choose(int layout, const T& col, const T& row) { return layout == ColMajor ? col : row; } -static TensorBlockV2ShapeType RandomShape() { +static TensorBlockShapeType RandomShape() { return internal::random() - ? TensorBlockV2ShapeType::kUniformAllDims - : TensorBlockV2ShapeType::kSkewedInnerDims; + ? TensorBlockShapeType::kUniformAllDims + : TensorBlockShapeType::kSkewedInnerDims; } template @@ -67,13 +67,13 @@ static void Debug(DSizes dims) { template static void test_block_mapper_sanity() { - typedef internal::TensorBlockV2Mapper<2, Layout> TensorBlockMapper; + typedef internal::TensorBlockMapper<2, Layout> TensorBlockMapper; DSizes tensor_dims(100, 100); // Test uniform blocks. TensorBlockMapper uniform_block_mapper( - tensor_dims, {TensorBlockV2ShapeType::kUniformAllDims, 100}); + tensor_dims, {TensorBlockShapeType::kUniformAllDims, 100}); VERIFY_IS_EQUAL(uniform_block_mapper.blockCount(), 100); VERIFY_IS_EQUAL(uniform_block_mapper.blockTotalSize(), 100); @@ -85,7 +85,7 @@ static void test_block_mapper_sanity() // Test skewed to inner dims blocks. 
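
The test updates in this region exercise the renamed mapper directly. Outside the test harness the same calls look like this; a standalone sketch mirroring the uniform-blocks assertions above, with plain asserts in place of VERIFY_IS_EQUAL.

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <cassert>

    // Sketch mirroring the block-mapper test above: one hundred uniform
    // 10x10 blocks tile a 100x100 tensor when the budget is 100 coefficients.
    void block_mapper_sketch() {
      using namespace Eigen;
      typedef internal::TensorBlockMapper<2, ColMajor> BlockMapper;

      DSizes<Index, 2> tensor_dims(100, 100);
      BlockMapper mapper(
          tensor_dims, {internal::TensorBlockShapeType::kUniformAllDims, 100});

      assert(mapper.blockCount() == 100);
      assert(mapper.blockTotalSize() == 100);

      internal::TensorBlockDescriptor<2> block = mapper.blockDescriptor(0);
      assert(block.dimensions().TotalSize() <= 100);
    }
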
TensorBlockMapper skewed_block_mapper( - tensor_dims, {TensorBlockV2ShapeType::kSkewedInnerDims, 100}); + tensor_dims, {TensorBlockShapeType::kSkewedInnerDims, 100}); VERIFY_IS_EQUAL(skewed_block_mapper.blockCount(), 100); VERIFY_IS_EQUAL(skewed_block_mapper.blockTotalSize(), 100); @@ -121,7 +121,7 @@ static void UpdateCoeffSet( template static void test_block_mapper_maps_every_element() { - typedef internal::TensorBlockV2Mapper TensorBlockMapper; + typedef internal::TensorBlockMapper TensorBlockMapper; DSizes dims = RandomDims(); DSizes strides = internal::strides(dims); @@ -227,14 +227,14 @@ template static void test_uniform_block_shape() { typedef internal::TensorBlockDescriptor<5> TensorBlock; - typedef internal::TensorBlockV2Mapper<5, Layout> TensorBlockMapper; + typedef internal::TensorBlockMapper<5, Layout> TensorBlockMapper; { // Test shape 'UniformAllDims' with uniform 'max_coeff count'. DSizes dims(11, 5, 6, 17, 7); const Index max_coeff_count = 5 * 5 * 5 * 5 * 5; TensorBlockMapper - block_mapper(dims, {TensorBlockV2ShapeType::kUniformAllDims, + block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, max_coeff_count}); TensorBlock block = block_mapper.blockDescriptor(0); for (int i = 0; i < 5; ++i) { @@ -249,7 +249,7 @@ static void test_uniform_block_shape() DSizes dims(11, 5, 6, 17, 7); const Index max_coeff_count = 7 * 5 * 5 * 5 * 5; TensorBlockMapper - block_mapper(dims, {TensorBlockV2ShapeType::kUniformAllDims, + block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, max_coeff_count}); TensorBlock block = block_mapper.blockDescriptor(0); VERIFY_IS_EQUAL(7, block.dimensions()[0]); @@ -261,7 +261,7 @@ static void test_uniform_block_shape() DSizes dims(11, 5, 6, 17, 7); const Index max_coeff_count = 5 * 5 * 5 * 5 * 6; TensorBlockMapper - block_mapper(dims, {TensorBlockV2ShapeType::kUniformAllDims, + block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, max_coeff_count}); TensorBlock block = block_mapper.blockDescriptor(0); VERIFY_IS_EQUAL(6, block.dimensions()[4]); @@ -277,7 +277,7 @@ static void test_uniform_block_shape() DSizes dims(11, 5, 6, 17, 7); const Index max_coeff_count = 11 * 5 * 5 * 5 * 5; TensorBlockMapper - block_mapper(dims, {TensorBlockV2ShapeType::kUniformAllDims, + block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, max_coeff_count}); TensorBlock block = block_mapper.blockDescriptor(0); VERIFY_IS_EQUAL(11, block.dimensions()[0]); @@ -289,7 +289,7 @@ static void test_uniform_block_shape() DSizes dims(11, 5, 6, 17, 7); const Index max_coeff_count = 5 * 5 * 5 * 5 * 7; TensorBlockMapper - block_mapper(dims, {TensorBlockV2ShapeType::kUniformAllDims, + block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, max_coeff_count}); TensorBlock block = block_mapper.blockDescriptor(0); VERIFY_IS_EQUAL(7, block.dimensions()[4]); @@ -305,7 +305,7 @@ static void test_uniform_block_shape() DSizes dims(7, 5, 6, 17, 7); const Index max_coeff_count = 7 * 5 * 6 * 7 * 5; TensorBlockMapper - block_mapper(dims, {TensorBlockV2ShapeType::kUniformAllDims, + block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, max_coeff_count}); TensorBlock block = block_mapper.blockDescriptor(0); VERIFY_IS_EQUAL(7, block.dimensions()[0]); @@ -318,7 +318,7 @@ static void test_uniform_block_shape() DSizes dims(7, 5, 6, 9, 7); const Index max_coeff_count = 5 * 5 * 5 * 6 * 7; TensorBlockMapper - block_mapper(dims, {TensorBlockV2ShapeType::kUniformAllDims, + block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, max_coeff_count}); TensorBlock block = 
     VERIFY_IS_EQUAL(7, block.dimensions()[4]);
@@ -334,7 +334,7 @@ static void test_uniform_block_shape()
     DSizes<Index, 5> dims(7, 5, 6, 17, 7);
     const Index max_coeff_count = 7 * 5 * 6 * 17 * 7;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kUniformAllDims,
+        block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[0]);
@@ -347,7 +347,7 @@ static void test_uniform_block_shape()
     DSizes<Index, 5> dims(7, 5, 6, 9, 7);
     const Index max_coeff_count = 7 * 5 * 6 * 9 * 7;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kUniformAllDims,
+        block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[4]);
@@ -363,14 +363,14 @@ template <int Layout>
 static void test_skewed_inner_dim_block_shape()
 {
   typedef internal::TensorBlockDescriptor<5> TensorBlock;
-  typedef internal::TensorBlockV2Mapper<5, Layout> TensorBlockMapper;
+  typedef internal::TensorBlockMapper<5, Layout> TensorBlockMapper;

   // Test shape 'SkewedInnerDims' with partial allocation to inner-most dim.
   if (Layout == ColMajor) {
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 10 * 1 * 1 * 1 * 1;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kSkewedInnerDims,
+        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(10, block.dimensions()[0]);
@@ -382,7 +382,7 @@ static void test_skewed_inner_dim_block_shape()
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 1 * 1 * 1 * 1 * 6;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kSkewedInnerDims,
+        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(6, block.dimensions()[4]);
@@ -397,7 +397,7 @@ static void test_skewed_inner_dim_block_shape()
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 11 * 1 * 1 * 1 * 1;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kSkewedInnerDims,
+        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(11, block.dimensions()[0]);
@@ -409,7 +409,7 @@ static void test_skewed_inner_dim_block_shape()
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 1 * 1 * 1 * 1 * 7;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kSkewedInnerDims,
+        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[4]);
@@ -425,7 +425,7 @@ static void test_skewed_inner_dim_block_shape()
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 11 * 3 * 1 * 1 * 1;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kSkewedInnerDims,
+        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(11, block.dimensions()[0]);
@@ -438,7 +438,7 @@ static void test_skewed_inner_dim_block_shape()
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 1 * 1 * 1 * 15 * 7;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kSkewedInnerDims,
+        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[4]);
@@ -455,7 +455,7 @@ static void test_skewed_inner_dim_block_shape()
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 11 * 5 * 5 * 1 * 1;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kSkewedInnerDims,
+        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(11, block.dimensions()[0]);
@@ -469,7 +469,7 @@ static void test_skewed_inner_dim_block_shape()
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 1 * 1 * 5 * 17 * 7;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kSkewedInnerDims,
+        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[4]);
@@ -486,7 +486,7 @@ static void test_skewed_inner_dim_block_shape()
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 11 * 5 * 6 * 17 * 7;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kSkewedInnerDims,
+        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(11, block.dimensions()[0]);
@@ -499,7 +499,7 @@ static void test_skewed_inner_dim_block_shape()
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 11 * 5 * 6 * 17 * 7;
     TensorBlockMapper
-        block_mapper(dims, {TensorBlockV2ShapeType::kSkewedInnerDims,
+        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
                             max_coeff_count});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[4]);
@@ -512,7 +512,7 @@ static void test_skewed_inner_dim_block_shape()
 }

 template <int Layout>
-static void test_empty_dims(const internal::TensorBlockV2ShapeType block_shape)
+static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
 {
   // Test blocking of tensors with zero dimensions:
   //  - we must not crash on asserts and divisions by zero
@@ -520,7 +520,7 @@ static void test_empty_dims(const internal::TensorBlockV2ShapeType block_shape)
   //    (recipe for overflows/underflows, divisions by zero and NaNs later)
   //  - total block count must be zero
   {
-    typedef internal::TensorBlockV2Mapper<1, Layout> TensorBlockMapper;
+    typedef internal::TensorBlockMapper<1, Layout> TensorBlockMapper;

     DSizes<Index, 1> dims(0);
     for (size_t max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
@@ -531,7 +531,7 @@ static void test_empty_dims(const internal::TensorBlockV2ShapeType block_shape)
   }

   {
-    typedef internal::TensorBlockV2Mapper<2, Layout> TensorBlockMapper;
+    typedef internal::TensorBlockMapper<2, Layout> TensorBlockMapper;

     for (int dim1 = 0; dim1 < 3; ++dim1) {
       for (int dim2 = 0; dim2 < 3; ++dim2) {
@@ -573,8 +573,8 @@ EIGEN_DECLARE_TEST(cxx11_tensor_block_access) {
   TEST_LAYOUTS_AND_DIMS(float, test_block_mapper_maps_every_element);
   TEST_LAYOUTS(test_uniform_block_shape);
   TEST_LAYOUTS(test_skewed_inner_dim_block_shape);
-  TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockV2ShapeType::kUniformAllDims);
-  TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockV2ShapeType::kSkewedInnerDims);
+  TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kUniformAllDims);
+  TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kSkewedInnerDims);
 }

 #undef TEST_LAYOUTS
diff --git a/unsupported/test/cxx11_tensor_block_eval.cpp b/unsupported/test/cxx11_tensor_block_eval.cpp
index 700e84a19..4a785dcdc 100644
--- a/unsupported/test/cxx11_tensor_block_eval.cpp
+++ b/unsupported/test/cxx11_tensor_block_eval.cpp
@@ -61,9 +61,9 @@ static TensorBlockParams<NumDims> RandomBlock(DSizes<Index, NumDims> dims,

 template <int Layout, int NumDims>
 static TensorBlockParams<NumDims> SkewedInnerBlock(
     DSizes<Index, NumDims> dims) {
-  using BlockMapper = internal::TensorBlockV2Mapper<NumDims, Layout>;
+  using BlockMapper = internal::TensorBlockMapper<NumDims, Layout>;
   BlockMapper block_mapper(dims,
-                           {internal::TensorBlockV2ShapeType::kSkewedInnerDims,
+                           {internal::TensorBlockShapeType::kSkewedInnerDims,
                             internal::random<size_t>(1, dims.TotalSize())});

   Index total_blocks = block_mapper.blockCount();
@@ -158,7 +158,7 @@ static void VerifyBlockEvaluator(Expression expr, GenBlockParams gen_block) {
   }

   const bool root_of_expr = internal::random<bool>();
-  auto tensor_block = eval.blockV2(block_params.desc, scratch, root_of_expr);
+  auto tensor_block = eval.block(block_params.desc, scratch, root_of_expr);

   if (tensor_block.kind() == internal::TensorBlockKind::kMaterializedInOutput) {
     // Copy data from destination buffer.
@@ -596,7 +596,7 @@ static void VerifyBlockAssignment(Tensor<T, NumDims, Layout>& tensor,
   tensor.setZero();

   // Use evaluator to write block into a tensor.
-  eval.writeBlockV2(block_params.desc, blk);
+  eval.writeBlock(block_params.desc, blk);

   // Make a copy of the result after assignment.
   Tensor<T, NumDims, Layout> block_assigned = tensor;
diff --git a/unsupported/test/cxx11_tensor_block_io.cpp b/unsupported/test/cxx11_tensor_block_io.cpp
index 6f318d9fe..25584433e 100644
--- a/unsupported/test/cxx11_tensor_block_io.cpp
+++ b/unsupported/test/cxx11_tensor_block_io.cpp
@@ -22,10 +22,10 @@ static DSizes<Index, NumDims> RandomDims(Index min, Index max) {
   return DSizes<Index, NumDims>(dims);
 }

-static internal::TensorBlockV2ShapeType RandomBlockShape() {
+static internal::TensorBlockShapeType RandomBlockShape() {
   return internal::random<bool>()
-             ? internal::TensorBlockV2ShapeType::kUniformAllDims
-             : internal::TensorBlockV2ShapeType::kSkewedInnerDims;
+             ? internal::TensorBlockShapeType::kUniformAllDims
+             : internal::TensorBlockShapeType::kSkewedInnerDims;
 }

 template <int NumDims>
@@ -60,7 +60,7 @@ static Index GetInputIndex(Index output_index,

 template <typename T, int NumDims, int Layout>
 static void test_block_io_copy_data_from_source_to_target() {
-  using TensorBlockIO = internal::TensorBlockIOV2<T, Index, NumDims, Layout>;
+  using TensorBlockIO = internal::TensorBlockIO<T, Index, NumDims, Layout>;
   using IODst = typename TensorBlockIO::Dst;
   using IOSrc = typename TensorBlockIO::Src;
@@ -74,7 +74,7 @@ static void test_block_io_copy_data_from_source_to_target() {

   // Construct a tensor block mapper.
   using TensorBlockMapper =
-      internal::TensorBlockV2Mapper<NumDims, Layout>;
+      internal::TensorBlockMapper<NumDims, Layout>;
   TensorBlockMapper block_mapper(
       dims, {RandomBlockShape(), RandomTargetBlockSize(dims)});
@@ -145,7 +145,7 @@ static void test_block_io_copy_using_reordered_dimensions() {
   // Construct a tensor block mapper.
   // NOTE: Tensor block mapper works with shuffled dimensions.
   using TensorBlockMapper =
-      internal::TensorBlockV2Mapper<NumDims, Layout>;
+      internal::TensorBlockMapper<NumDims, Layout>;
   TensorBlockMapper block_mapper(output_tensor_dims,
                                  {RandomBlockShape(),
                                   RandomTargetBlockSize(output_tensor_dims)});
@@ -169,7 +169,7 @@ static void test_block_io_copy_using_reordered_dimensions() {

   // NOTE: Block dimensions are in the same order as output dimensions.
-  using TensorBlockIO = internal::TensorBlockIOV2;
+  using TensorBlockIO = internal::TensorBlockIO;
   using IODst = typename TensorBlockIO::Dst;
   using IOSrc = typename TensorBlockIO::Src;
@@ -181,7 +181,7 @@ static void test_block_io_copy_using_reordered_dimensions() {
   IODst dst(blk_dims, blk_strides, block_data, 0);
   IOSrc src(input_strides, input_data, first_coeff_index);

-  // TODO(ezhulenev): Remove when fully switched to TensorBlockV2.
+  // TODO(ezhulenev): Remove when fully switched to TensorBlock.
   DSizes<int, NumDims> dim_map;
   for (int j = 0; j < NumDims; ++j)
     dim_map[j] = static_cast<int>(output_to_input_dim_map[j]);
@@ -199,7 +199,7 @@ static void test_block_io_copy_using_reordered_dimensions() {
   IODst dst(dst_dims, input_strides, output_data, first_coeff_index);
   IOSrc src(blk_strides, block_data, 0);

-  // TODO(ezhulenev): Remove when fully switched to TensorBlockV2.
+  // TODO(ezhulenev): Remove when fully switched to TensorBlock.
   DSizes<int, NumDims> dim_map;
   for (int j = 0; j < NumDims; ++j)
     dim_map[j] = static_cast<int>(input_to_output_dim_map[j]);
@@ -235,7 +235,7 @@ static void test_block_io_copy_using_reordered_dimensions_do_not_squeeze() {
   float* tensor_data = tensor.data();
   float* block_data = block.data();

-  using TensorBlockIO = internal::TensorBlockIOV2;
+  using TensorBlockIO = internal::TensorBlockIO;
   using IODst = typename TensorBlockIO::Dst;
   using IOSrc = typename TensorBlockIO::Src;
@@ -283,7 +283,7 @@ static void test_block_io_copy_using_reordered_dimensions_squeeze() {
   float* tensor_data = tensor.data();
   float* block_data = block.data();

-  using TensorBlockIO = internal::TensorBlockIOV2;
+  using TensorBlockIO = internal::TensorBlockIO;
   using IODst = typename TensorBlockIO::Dst;
   using IOSrc = typename TensorBlockIO::Src;
@@ -334,7 +334,7 @@ static void test_block_io_zero_stride() {
   Tensor output(output_tensor_dims);
   output.setRandom();

-  using TensorBlockIO = internal::TensorBlockIOV2;
+  using TensorBlockIO = internal::TensorBlockIO;
   using IODst = typename TensorBlockIO::Dst;
   using IOSrc = typename TensorBlockIO::Src;
@@ -360,7 +360,7 @@ template <int Layout>
 static void test_block_io_squeeze_ones() {
-  using TensorBlockIO = internal::TensorBlockIOV2;
+  using TensorBlockIO = internal::TensorBlockIO;
   using IODst = typename TensorBlockIO::Dst;
   using IOSrc = typename TensorBlockIO::Src;
-- 
cgit v1.2.3