// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H

namespace Eigen {
namespace internal {

// -------------------------------------------------------------------------- //
// Forward declarations for templates defined below.
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIO;

// -------------------------------------------------------------------------- //
// Helper function to compute strides for densely stored buffer of given
// dimensions.

// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
// this function instead everywhere.
template <int Layout, typename IndexType, int NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
    const DSizes<IndexType, NumDims>& dimensions) {
  DSizes<IndexType, NumDims> strides;
  if (NumDims == 0) return strides;

  // TODO(ezhulenev): Use templates to unroll this loop (similar to
  // h_array_reduce in CXX11meta.h)? Benchmark it.
  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    strides[0] = 1;
    for (int i = 1; i < NumDims; ++i) {
      strides[i] = strides[i - 1] * dimensions[i - 1];
    }
  } else {
    strides[NumDims - 1] = 1;
    for (int i = NumDims - 2; i >= 0; --i) {
      strides[i] = strides[i + 1] * dimensions[i + 1];
    }
  }

  return strides;
}

template <int Layout, typename IndexType, size_t NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
    const Eigen::array<IndexType, NumDims>& dimensions) {
  return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
}

template <int Layout, std::ptrdiff_t... Indices>
EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
    const Sizes<Indices...>& sizes) {
  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
}
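// A minimal usage sketch (illustrative only, not part of the API): strides of
// a densely stored 2x3x4 buffer. For ColMajor the inner dimension is the
// first one, for RowMajor it is the last one.
//
//   DSizes<Index, 3> dims(2, 3, 4);
//   DSizes<Index, 3> col = strides<ColMajor>(dims);  // [1, 2, 6]
//   DSizes<Index, 3> row = strides<RowMajor>(dims);  // [12, 4, 1]
//   // In the ColMajor buffer, element (i, j, k) lives at
//   // i * col[0] + j * col[1] + k * col[2].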
// -------------------------------------------------------------------------- //
// Tensor block shape type defines the shape preference for the blocks
// extracted from the larger tensor.
//
// Example: blocks of 100 elements from the large 100x100 tensor:
// - tensor: 100x100
// - target_block_size: 100
//
// TensorBlockShapeType:
//  - kUniformAllDims: 100 blocks of size 10x10
//  - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a
//                      column or row major layout)
enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };

struct TensorBlockResourceRequirements {
  TensorBlockShapeType shape_type;  // target block shape
  size_t size;                      // target block size
  TensorOpCost cost_per_coeff;      // cost of computing a single block element

#ifdef EIGEN_HIPCC
  // For HIPCC, we need to explicitly declare as a "device fun" the constructor
  // which is implicitly invoked in the "merge" / "any" routines, else HIPCC
  // errors out complaining about the lack of a matching constructor.
  EIGEN_DEVICE_FUNC
  TensorBlockResourceRequirements(TensorBlockShapeType shape_type_,
                                  size_t size_, TensorOpCost cost_)
      : shape_type(shape_type_), size(size_), cost_per_coeff(cost_) {}
#endif

  template <typename Scalar>
  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
      TensorBlockShapeType shape_type, size_t size_in_bytes,
      TensorOpCost cost) {
    const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
    return {shape_type, size, cost};
  }

  template <typename Scalar>
  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
      TensorBlockShapeType shape_type, size_t size_in_bytes) {
    // This default cost per coefficient is valid for most materialized tensor
    // block evaluation implementations, because they typically just read
    // coefficients from the underlying tensor storage, and write to the tensor
    // block buffer (scratch or destination memory; reads and writes have a
    // linear access pattern). We ignore the fixed cost of block evaluation,
    // because in practice it should be negligible.
    //
    // Lazy block evaluation adds the cost of calling a functor for each
    // coefficient.
    //
    // All non-trivial block evaluation implementations must provide their own
    // cost approximation (e.g. shuffling the inner dimension has a much higher
    // cost because it reads memory randomly, although the total number of
    // moved bytes is the same).
    return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
                                    {/*bytes_loaded=*/sizeof(Scalar),
                                     /*bytes_stored=*/sizeof(Scalar),
                                     /*compute_cycles=*/0});
  }

  template <typename Scalar>
  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
      size_t size_in_bytes) {
    return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
                                    size_in_bytes);
  }

  template <typename Scalar>
  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
      size_t size_in_bytes) {
    return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
                                    size_in_bytes);
  }

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements merge(
      const TensorBlockResourceRequirements& lhs,
      const TensorBlockResourceRequirements& rhs) {
    return {merge(lhs.shape_type, rhs.shape_type),           // shape_type
            merge(lhs.size, rhs.size),                       // size
            merge(lhs.cost_per_coeff, rhs.cost_per_coeff)};  // cost_per_coeff
  }

  EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
      TensorOpCost cost) {
    cost_per_coeff += cost;
    return *this;
  }

  // This is a resource requirement that should be returned from expressions
  // that do not have any block evaluation preference (e.g. default tensor
  // expression with raw buffer access).
  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
    return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
  }

 private:
  using Requirements = TensorBlockResourceRequirements;

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
    return numext::maxi(lhs_size, rhs_size);
  }

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE TensorBlockShapeType merge(
      TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
    return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
            rhs == TensorBlockShapeType::kSkewedInnerDims)
               ? TensorBlockShapeType::kSkewedInnerDims
               : TensorBlockShapeType::kUniformAllDims;
  }

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
                                                TensorOpCost rhs_cost) {
    return lhs_cost + rhs_cost;
  }
};

// -------------------------------------------------------------------------- //
// TensorBlockDescriptor specifies a block offset within a tensor and the block
// sizes along each of the tensor dimensions.
template <int NumDims, typename IndexType = Index>
class TensorBlockDescriptor {
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;

  // If we evaluate a Tensor assignment, and the expression on the left already
  // has a memory buffer, then we might do a performance optimization and
  // evaluate the root expression directly into the final output memory.
  // Sometimes it's also possible to reuse that buffer for materializing
  // subexpressions inside an expression tree, to avoid dynamic memory
  // allocation.
  //
  // The pointer type of the underlying storage is erased, because passing
  // the Scalar type through all the expression evaluation layers is way too
  // many templates. In practice the destination buffer type should always
  // match the evaluated expression scalar type.
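  //
  // A usage sketch (illustrative only; `output_ptr` and `output_strides` are
  // hypothetical names): an evaluator that owns an output buffer can attach it
  // to the descriptor, and a nested expression can later check whether that
  // buffer is usable for direct materialization.
  //
  //   TensorBlockDescriptor<2>::Dimensions dims(16, 16);
  //   TensorBlockDescriptor<2> desc(/*offset=*/0, dims);
  //   desc.AddDestinationBuffer<float>(output_ptr, output_strides);
  //   if (desc.destination().kind() == DestinationBuffer::kContiguous) {
  //     float* dst = desc.destination().template data<float>();
  //     // ... materialize the block directly into `dst` ...
  //   }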
class DestinationBuffer { public: enum DestinationBufferKind : int { // The above explicit specification of "int" as the enum basetype is // needed to get around a HIPCC link error ("the field type is not // amp-compatible") // which is issued for class members with the enum type. // TODO(rocm): // remove the "int" basetype once HIPCC has been fixed to not error out // in the above scenario. // Destination buffer is not defined (`m_data` == nullptr). kEmpty, // Tensor block defined by an owning tensor block descriptor can fit // contiguously into the destination buffer. In this case it's safe to // materialize tensor block in the destination buffer, wrap it in a // TensorMap, and use to build Eigen expression on top of it. kContiguous, // Destination buffer strides do not match strides of the contiguously // stored block, and it's impossible to define a TensorMap over this // buffer. However if we are evaluating a root of an expression tree, we // still can materialize an output into this destination, because we can // guarantee that no one will ever access it through block API. // // In theory it is possible to build valid TensorStriding // expression on top of this destination buffer, however it has // inefficient coeff/packet access, and defeats the purpose of fast block // evaluation API. kStrided }; template Scalar* data() const { eigen_assert(m_data_type_size == sizeof(Scalar)); return static_cast(m_data); } const Dimensions& strides() const { return m_strides; } const DestinationBufferKind& kind() const { return m_kind; } private: friend class TensorBlockDescriptor; DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {} template DestinationBuffer(Scalar* data, const Dimensions& strides, DestinationBufferKind kind) : m_data(static_cast(data)), m_data_type_size(sizeof(Scalar)), m_strides(strides), m_kind(kind) {} template static DestinationBuffer make(const TensorBlockDescriptor& desc, Scalar* data, const Dimensions& strides) { return DestinationBuffer(data, strides, kind(desc, strides)); } template static DestinationBufferKind kind(const TensorBlockDescriptor& desc, const Dimensions& strides) { const Dimensions& desc_dims = desc.dimensions(); const Dimensions& desc_strides = internal::strides(desc_dims); for (int i = 0; i < NumDims; ++i) { if (desc_dims[i] == 1) continue; if (desc_strides[i] != strides[i]) return kStrided; } return kContiguous; } // Storage pointer is type erased, to reduce template bloat, but we still // keep the size of the underlying element type for error checking. void* m_data; size_t m_data_type_size; // Destination buffer dimensions always match the dimensions of a tensor // block descriptor it belongs to, however strides might be different. 
Dimensions m_strides; DestinationBufferKind m_kind; }; TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions, const DestinationBuffer& destination) : m_offset(offset), m_dimensions(dimensions), m_destination(destination) {} TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions) : m_offset(offset), m_dimensions(dimensions), m_destination(DestinationBuffer()) {} IndexType offset() const { return m_offset; } const Dimensions& dimensions() const { return m_dimensions; } IndexType dimension(int index) const { return m_dimensions[index]; } IndexType size() const { return array_prod(m_dimensions); } const DestinationBuffer& destination() const { return m_destination; } template void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) { eigen_assert(dst_base != NULL); m_destination = DestinationBuffer::template make(*this, dst_base, dst_strides); } template void AddDestinationBuffer( Scalar* dst_base, const DSizes& dst_strides) { // DSizes constructor will do index type promotion if it's safe. AddDestinationBuffer(dst_base, Dimensions(dst_strides)); } TensorBlockDescriptor& DropDestinationBuffer() { m_destination.m_data = NULL; m_destination.m_kind = DestinationBuffer::kEmpty; return *this; } bool HasDestinationBuffer() const { return m_destination.kind() != DestinationBuffer::kEmpty; } // Returns a copy of `*this` with updated offset. TensorBlockDescriptor WithOffset(IndexType offset) const { return TensorBlockDescriptor(offset, m_dimensions, m_destination); } private: // Offset and dimensions are immutable after construction. Block descriptor // can only be mutated by adding or dropping destination. const IndexType m_offset; const Dimensions m_dimensions; DestinationBuffer m_destination; }; // -------------------------------------------------------------------------- // // TensorBlockMapper is responsible for iterating over the blocks of a tensor. template class TensorBlockMapper { typedef TensorBlockDescriptor BlockDescriptor; public: typedef DSizes Dimensions; TensorBlockMapper() = default; TensorBlockMapper(const DSizes& dimensions, const TensorBlockResourceRequirements& requirements) : m_tensor_dimensions(dimensions), m_requirements(requirements) { // Compute block dimensions and the total number of blocks. InitializeBlockDimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const { return m_total_block_count; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const { return m_block_dimensions.TotalSize(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes& blockDimensions() const { return m_block_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor blockDescriptor(IndexType block_index) const { static const bool isColMajor = Layout == static_cast(ColMajor); IndexType offset = 0; DSizes dimensions; if (NumDims == 0) return BlockDescriptor(offset, dimensions); // Iterate outer -> inner dimensions. for (int i = NumDims - 1; i >= 0; --i) { const int dim = isColMajor ? i : NumDims - i - 1; const IndexType idx = block_index / m_block_strides[dim]; block_index -= idx * m_block_strides[dim]; const IndexType coord = idx * m_block_dimensions[dim]; dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord, m_block_dimensions[dim]); offset += coord * m_tensor_strides[dim]; } return {offset, dimensions}; } private: void InitializeBlockDimensions() { // Requested block shape and size. 
const TensorBlockShapeType shape_type = m_requirements.shape_type; IndexType target_block_size = numext::maxi(1, static_cast(m_requirements.size)); IndexType tensor_size = m_tensor_dimensions.TotalSize(); // Corner case: one of the dimensions is zero. Logic below is too complex // to handle this case on a general basis, just use unit block size. // Note: we must not yield blocks with zero dimensions (recipe for // overflows/underflows, divisions by zero and NaNs later). if (tensor_size == 0) { for (int i = 0; i < NumDims; ++i) { m_block_dimensions[i] = 1; } m_total_block_count = 0; return; } // If tensor fits into a target block size, evaluate it as a single block. if (tensor_size <= target_block_size) { m_block_dimensions = m_tensor_dimensions; m_total_block_count = 1; // The only valid block index is `0`, and in this case we do not need // to compute real strides for tensor or blocks (see blockDescriptor). for (int i = 0; i < NumDims; ++i) { m_tensor_strides[i] = 0; m_block_strides[i] = 1; } return; } static const bool isColMajor = Layout == static_cast(ColMajor); // Block shape skewed towards inner dimension. if (shape_type == TensorBlockShapeType::kSkewedInnerDims) { IndexType coeff_to_allocate = target_block_size; for (int i = 0; i < NumDims; ++i) { const int dim = isColMajor ? i : NumDims - i - 1; m_block_dimensions[dim] = numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]); coeff_to_allocate = divup( coeff_to_allocate, numext::maxi(static_cast(1), m_block_dimensions[dim])); } eigen_assert(coeff_to_allocate == 1); } else if (shape_type == TensorBlockShapeType::kUniformAllDims) { // Tensor will not fit within 'target_block_size' budget: calculate tensor // block dimension sizes based on "square" dimension size target. const IndexType dim_size_target = convert_index( std::pow(static_cast(target_block_size), 1.0f / static_cast(m_block_dimensions.rank()))); for (int i = 0; i < NumDims; ++i) { // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it // a multiple of the packet size. Note that reducing // 'block_dim_size' in this manner can increase the number of // blocks, and so will amplify any per-block overhead. m_block_dimensions[i] = numext::mini(dim_size_target, m_tensor_dimensions[i]); } // Add any un-allocated coefficients to inner dimension(s). IndexType total_size = m_block_dimensions.TotalSize(); for (int i = 0; i < NumDims; ++i) { const int dim = isColMajor ? i : NumDims - i - 1; if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) { const IndexType total_size_other_dims = total_size / m_block_dimensions[dim]; const IndexType alloc_avail = divup(target_block_size, total_size_other_dims); if (alloc_avail == m_block_dimensions[dim]) { // Insufficient excess coefficients to allocate. break; } m_block_dimensions[dim] = numext::mini(m_tensor_dimensions[dim], alloc_avail); total_size = total_size_other_dims * m_block_dimensions[dim]; } } } else { eigen_assert(false); // unknown block shape } eigen_assert(m_block_dimensions.TotalSize() >= numext::mini(target_block_size, m_tensor_dimensions.TotalSize())); // Calculate block counts by dimension and total block count. DSizes block_count; for (int i = 0; i < NumDims; ++i) { block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]); } m_total_block_count = array_prod(block_count); // Calculate block strides (used for enumerating blocks). 
    m_tensor_strides = strides<Layout>(m_tensor_dimensions);
    m_block_strides = strides<Layout>(block_count);
  }

  DSizes<IndexType, NumDims> m_tensor_dimensions;
  TensorBlockResourceRequirements m_requirements;

  DSizes<IndexType, NumDims> m_block_dimensions;
  IndexType m_total_block_count;

  DSizes<IndexType, NumDims> m_tensor_strides;
  DSizes<IndexType, NumDims> m_block_strides;
};

// -------------------------------------------------------------------------- //
// TensorBlockScratchAllocator is responsible for allocating temporary buffers
// for block evaluation (output or input block materialization). Given that
// Eigen expression traversal order is deterministic, all temporary allocations
// happen in the same order, and usually have exactly the same size. The
// scratch allocator keeps a trace of all dynamic allocations, and after the
// first block evaluation is completed, we should be able to reuse all the
// temporary buffers for the next block evaluation.

template <typename Device>
class TensorBlockScratchAllocator {
 public:
  explicit TensorBlockScratchAllocator(const Device& device)
      : m_device(device), m_allocation_index(0) {}

  ~TensorBlockScratchAllocator() {
    for (size_t i = 0; i < m_allocations.size(); ++i) {
      m_device.deallocate(m_allocations[i].ptr);
    }
  }

  void* allocate(size_t size) {
    // TODO(ezhulenev): Remove when replaced with inlined vector.
    if (m_allocations.capacity() == 0) m_allocations.reserve(8);

    // Check if we already have an existing allocation at the current index.
    const int num_allocations = static_cast<int>(m_allocations.size());
    const bool has_allocation = m_allocation_index < num_allocations;

    // Allocation index can't be larger than the number of allocations.
    eigen_assert(m_allocation_index <= num_allocations);

    // If we have an existing allocation, and its size is larger than or equal
    // to the requested size, we do nothing.

    // If the current allocation can't fit the requested size, we deallocate
    // it, and replace it with a larger allocation.
    if (has_allocation && m_allocations[m_allocation_index].size < size) {
      m_device.deallocate(m_allocations[m_allocation_index].ptr);
      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
      m_allocations[m_allocation_index].size = size;
    }

    // Make a new allocation if we don't have an existing one.
    if (!has_allocation) {
      Allocation allocation;
      allocation.ptr = m_device.allocate(size);
      allocation.size = size;
      m_allocations.push_back(allocation);
    }

    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
    eigen_assert(m_allocations[m_allocation_index].size >= size);

    return m_allocations[m_allocation_index++].ptr;
  }

  void reset() { m_allocation_index = 0; }

 private:
  struct Allocation {
    void* ptr;
    size_t size;
  };

  const Device& m_device;
  int m_allocation_index;
  // TODO(ezhulenev): This should be an inlined vector.
  std::vector<Allocation> m_allocations;
};
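// A usage sketch (illustrative only; `device` and `evaluate_block` are
// hypothetical): the same allocator instance is reused for every block, and
// `reset()` rewinds the allocation index so the next block evaluation reuses
// the buffers allocated by the previous one.
//
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//   for (Index block = 0; block < block_mapper.blockCount(); ++block) {
//     evaluate_block(block_mapper.blockDescriptor(block), scratch);
//     scratch.reset();  // keep the buffers, rewind the allocation index
//   }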
// -------------------------------------------------------------------------- //
// TensorBlockKind represents all possible block kinds that can be produced by
// the TensorEvaluator::evalBlock function.
enum TensorBlockKind {
  // Tensor block that is a lazy expression that must be assigned to a
  // destination using TensorBlockAssign.
  kExpr,

  // Tensor block that is a view into a memory buffer owned by an underlying
  // Tensor expression (e.g. it can be a view into a Tensor buffer).
  kView,

  // Tensor block that was materialized in a scratch memory buffer, allocated
  // with TensorBlockScratchAllocator. This block must be copied to a
  // destination, similar to a block of `kExpr` type.
  kMaterializedInScratch,

  // Tensor block that was materialized directly into the final output memory
  // buffer. For example if the left side of an assignment is a Tensor, we can
  // directly materialize the block in the destination memory.
  //
  // If strides in the output buffer do not match tensor block strides, the
  // Tensor expression will be invalid, and should not be used by
  // TensorBlockAssign or for constructing another block expression.
  kMaterializedInOutput
};

// -------------------------------------------------------------------------- //
// TensorBlockNotImplemented should be used to define the TensorBlock typedef
// in TensorEvaluators that do not support block evaluation.
class TensorBlockNotImplemented {
 public:
  typedef void XprType;
};

// -------------------------------------------------------------------------- //
// XprScalar extracts the Scalar type from an Eigen expression (if the
// expression type is not void). It's required to be able to define a lazy
// block expression for argument types that do not support block evaluation.
template <typename XprType>
struct XprScalar {
  typedef typename XprType::Scalar type;
};
template <>
struct XprScalar<void> {
  typedef void type;
};

// -------------------------------------------------------------------------- //
// TensorMaterializedBlock is a fully evaluated block of the original tensor,
// and XprType is just a TensorMap over the data. This block type is typically
// used to materialize blocks of tensor expressions that can't be efficiently
// represented as lazy Tensor expressions with fast coeff/packet operations,
// e.g. we materialize all broadcasts into evaluated blocks.
//
// TensorMaterializedBlock does not own its memory buffer. It's either a memory
// buffer that backs the original expression (e.g. the block is just a view
// into a Tensor), or a memory buffer allocated with the scratch allocator, in
// which case the scratch allocator will deallocate it at the end of block
// based expression execution.
//
// If the block was evaluated directly into the output buffer, and strides in
// the output buffer do not match block strides, the TensorMap expression will
// be invalid, and should never be used in block assignment or any other tensor
// expression.
template <typename Scalar, int NumDims, int Layout,
          typename IndexType = Index>
class TensorMaterializedBlock {
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;

  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
                          const Dimensions& dimensions, bool valid_expr = true)
      : m_kind(kind),
        m_data(data),
        m_dimensions(dimensions),
        m_expr(m_data, m_dimensions),
        m_valid_expr(valid_expr) {
    eigen_assert(m_kind == internal::TensorBlockKind::kView ||
                 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
                 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
  }

  TensorBlockKind kind() const { return m_kind; }
  // NOTE(ezhulenev): Returning XprType by value like in other block types
  // causes asan failures. The theory is that XprType::Nested doesn't work
  // properly for TensorMap.
  const XprType& expr() const {
    eigen_assert(m_valid_expr);
    return m_expr;
  }
  const Scalar* data() const { return m_data; }
  void cleanup() {}

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;

  // TensorMaterializedBlock can be backed by different types of storage:
  //
  //   (1) Contiguous block of memory allocated with scratch allocator.
  //   (2) Contiguous block of memory reused from tensor block descriptor
  //       destination buffer.
  //   (3) Strided block of memory reused from tensor block descriptor
  //       destination buffer.
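  //
  // A sketch of the typical flow (illustrative only; `desc` and `scratch` are
  // assumed to be a block descriptor and a scratch allocator): prepareStorage()
  // below reuses a compatible destination buffer when the descriptor carries
  // one, and otherwise falls back to a scratch allocation.
  //
  //   const Storage storage = prepareStorage(desc, scratch);
  //   // ... write block coefficients to storage.data() ...
  //   return storage.AsTensorMaterializedBlock();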
// class Storage { public: Scalar* data() const { return m_data; } const Dimensions& dimensions() const { return m_dimensions; } const Dimensions& strides() const { return m_strides; } TensorMaterializedBlock AsTensorMaterializedBlock() const { return TensorMaterializedBlock( m_materialized_in_output ? internal::TensorBlockKind::kMaterializedInOutput : internal::TensorBlockKind::kMaterializedInScratch, m_data, m_dimensions, !m_strided_storage); } private: friend class TensorMaterializedBlock; Storage(Scalar* data, const Dimensions& dimensions, const Dimensions& strides, bool materialized_in_output, bool strided_storage) : m_data(data), m_dimensions(dimensions), m_strides(strides), m_materialized_in_output(materialized_in_output), m_strided_storage(strided_storage) {} Scalar* m_data; Dimensions m_dimensions; Dimensions m_strides; bool m_materialized_in_output; bool m_strided_storage; }; // Creates a storage for materialized block either from the block descriptor // destination buffer, or allocates a new buffer with scratch allocator. template EIGEN_STRONG_INLINE static Storage prepareStorage( TensorBlockDesc& desc, TensorBlockScratch& scratch, bool allow_strided_storage = false) { // Try to reuse destination as an output block buffer. typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer; if (desc.destination().kind() == DestinationBuffer::kContiguous) { Scalar* buffer = desc.destination().template data(); desc.DropDestinationBuffer(); return Storage(buffer, desc.dimensions(), internal::strides(desc.dimensions()), /*materialized_in_output=*/true, /*strided_storage=*/false); } else if (desc.destination().kind() == DestinationBuffer::kStrided && allow_strided_storage) { Scalar* buffer = desc.destination().template data(); desc.DropDestinationBuffer(); return Storage(buffer, desc.dimensions(), desc.destination().strides(), /*materialized_in_output=*/true, /*strided_storage=*/true); } else { void* mem = scratch.allocate(desc.size() * sizeof(Scalar)); return Storage(static_cast(mem), desc.dimensions(), internal::strides(desc.dimensions()), /*materialized_in_output=*/false, /*strided_storage=*/false); } } // Creates a materialized block for the given descriptor from a memory buffer. template EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize( const Scalar* data, const DataDimensions& data_dims, TensorBlockDesc& desc, TensorBlockScratch& scratch) { eigen_assert(array_size::value == desc.dimensions().size()); // If a tensor block dimensions covers a contiguous block of the underlying // memory, we can skip block buffer memory allocation, and construct a block // from existing `data` memory buffer. // // Example: (RowMajor layout) // data_dims: [11, 12, 13, 14] // desc.dimensions(): [1, 1, 3, 14] // // In this case we can construct a TensorBlock starting at // `data + desc.offset()`, with a `desc.dimensions()` block sizes. static const bool is_col_major = Layout == ColMajor; // Find out how many inner dimensions have a matching size. int num_matching_inner_dims = 0; for (int i = 0; i < NumDims; ++i) { int dim = is_col_major ? i : NumDims - i - 1; if (data_dims[dim] != desc.dimensions()[dim]) break; ++num_matching_inner_dims; } // All the outer dimensions must be of size `1`, except a single dimension // before the matching inner dimension (`3` in the example above). bool can_use_direct_access = true; for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) { int dim = is_col_major ? 
i : NumDims - i - 1; if (desc.dimension(dim) != 1) { can_use_direct_access = false; break; } } if (can_use_direct_access) { const Scalar* block_start = data + desc.offset(); return TensorMaterializedBlock(internal::TensorBlockKind::kView, block_start, desc.dimensions()); } else { // Reuse destination buffer or allocate new buffer with scratch allocator. const Storage storage = prepareStorage(desc, scratch); typedef internal::TensorBlockIO TensorBlockIO; typedef typename TensorBlockIO::Dst TensorBlockIODst; typedef typename TensorBlockIO::Src TensorBlockIOSrc; TensorBlockIOSrc src(internal::strides(Dimensions(data_dims)), data, desc.offset()); TensorBlockIODst dst(storage.dimensions(), storage.strides(), storage.data()); TensorBlockIO::Copy(dst, src); return storage.AsTensorMaterializedBlock(); } } private: TensorBlockKind m_kind; const Scalar* m_data; Dimensions m_dimensions; XprType m_expr; bool m_valid_expr; }; // -------------------------------------------------------------------------- // // TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp // functor to the blocks produced by the underlying Tensor expression. template class TensorCwiseUnaryBlock { static const bool NoArgBlockAccess = internal::is_void::value; public: typedef typename conditional< NoArgBlockAccess, void, TensorCwiseUnaryOp >:: type XprType; typedef typename XprScalar::type Scalar; TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor) : m_arg_block(arg_block), m_functor(functor) {} TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } XprType expr() const { return XprType(m_arg_block.expr(), m_functor); } const Scalar* data() const { return NULL; } void cleanup() { m_arg_block.cleanup(); } private: ArgTensorBlock m_arg_block; UnaryOp m_functor; }; // -------------------------------------------------------------------------- // // TensorCwiseUnaryBlock is a lazy tensor expression block that applies BinaryOp // functor to the blocks produced by the underlying Tensor expression. template class TensorCwiseBinaryBlock { static const bool NoArgBlockAccess = internal::is_void::value || internal::is_void::value; public: typedef typename conditional< NoArgBlockAccess, void, TensorCwiseBinaryOp >::type XprType; typedef typename XprScalar::type Scalar; TensorCwiseBinaryBlock(const LhsTensorBlock& left_block, const RhsTensorBlock& right_block, const BinaryOp& functor) : m_left_block(left_block), m_right_block(right_block), m_functor(functor) {} TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } XprType expr() const { return XprType(m_left_block.expr(), m_right_block.expr(), m_functor); } const Scalar* data() const { return NULL; } void cleanup() { m_left_block.cleanup(); m_right_block.cleanup(); } private: LhsTensorBlock m_left_block; RhsTensorBlock m_right_block; BinaryOp m_functor; }; // -------------------------------------------------------------------------- // // TensorUnaryExprBlock is a lazy tensor expression block that can construct // an arbitrary tensor expression from a block of the underlying type (this is a // generalization of the TensorCwiseUnaryBlock for arbitrary expressions). 
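// For reference, a minimal factory sketch (illustrative only, assuming a
// hypothetical `MyOp` expression type): a block factory is expected to expose
// a nested `XprType<Arg>::type` typedef and an `expr()` method that builds the
// lazy expression from the argument block expression.
//
//   struct MyOpBlockFactory {
//     template <typename ArgXpr>
//     struct XprType {
//       typedef MyOp<ArgXpr> type;
//     };
//     template <typename ArgXpr>
//     typename XprType<ArgXpr>::type expr(const ArgXpr& arg) const {
//       return MyOp<ArgXpr>(arg);
//     }
//   };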
template class TensorUnaryExprBlock { typedef typename ArgTensorBlock::XprType ArgXprType; static const bool NoArgBlockAccess = internal::is_void::value; public: typedef typename conditional< NoArgBlockAccess, void, typename BlockFactory::template XprType::type>::type XprType; typedef typename XprScalar::type Scalar; TensorUnaryExprBlock(const ArgTensorBlock& arg_block, const BlockFactory& factory) : m_arg_block(arg_block), m_factory(factory) {} TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } XprType expr() const { return m_factory.expr(m_arg_block.expr()); } const Scalar* data() const { return NULL; } void cleanup() { m_arg_block.cleanup(); } private: ArgTensorBlock m_arg_block; BlockFactory m_factory; }; // -------------------------------------------------------------------------- // // TensorTernaryExprBlock is a lazy tensor expression block that can construct // an arbitrary tensor expression from three blocks of the underlying type. template class TensorTernaryExprBlock { typedef typename Arg1TensorBlock::XprType Arg1XprType; typedef typename Arg2TensorBlock::XprType Arg2XprType; typedef typename Arg3TensorBlock::XprType Arg3XprType; static const bool NoArgBlockAccess = internal::is_void::value || internal::is_void::value || internal::is_void::value; public: typedef typename conditional< NoArgBlockAccess, void, typename BlockFactory::template XprType::type>::type XprType; typedef typename XprScalar::type Scalar; TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, const Arg2TensorBlock& arg2_block, const Arg3TensorBlock& arg3_block, const BlockFactory& factory) : m_arg1_block(arg1_block), m_arg2_block(arg2_block), m_arg3_block(arg3_block), m_factory(factory) {} TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } XprType expr() const { return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), m_arg3_block.expr()); } const Scalar* data() const { return NULL; } void cleanup() { m_arg1_block.cleanup(); m_arg2_block.cleanup(); m_arg3_block.cleanup(); } private: Arg1TensorBlock m_arg1_block; Arg2TensorBlock m_arg2_block; Arg3TensorBlock m_arg3_block; BlockFactory m_factory; }; // -------------------------------------------------------------------------- // // StridedLinearBufferCopy provides a method to copy data between two linear // buffers with different strides, with optimized paths for scatter/gather. template class StridedLinearBufferCopy { typedef typename packet_traits::type Packet; enum { Vectorizable = packet_traits::Vectorizable, PacketSize = packet_traits::size }; public: // Specifying linear copy kind statically gives ~30% speedup for small sizes. 
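  // For example (illustrative only; `dst_ptr`, `src_ptr` and `n` are
  // hypothetical), a dense-to-dense copy where both buffers have unit inner
  // stride dispatches to the Linear kind:
  //
  //   typedef StridedLinearBufferCopy<float, Index> LinCopy;
  //   LinCopy::Run<LinCopy::Kind::Linear>(
  //       LinCopy::Dst(/*offset=*/0, /*stride=*/1, dst_ptr),
  //       LinCopy::Src(/*offset=*/0, /*stride=*/1, src_ptr),
  //       /*count=*/n);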
enum class Kind { Linear = 0, // src_stride == 1 && dst_stride == 1 Scatter = 1, // src_stride == 1 && dst_stride != 1 FillLinear = 2, // src_stride == 0 && dst_stride == 1 FillScatter = 3, // src_stride == 0 && dst_stride != 1 Gather = 4, // dst_stride == 1 Random = 5 // everything else }; struct Dst { Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {} IndexType offset; IndexType stride; Scalar* data; }; struct Src { Src(IndexType o, IndexType s, const Scalar* d) : offset(o), stride(s), data(d) {} IndexType offset; IndexType stride; const Scalar* data; }; template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst, const Src& src, const size_t count) { Run(count, dst.offset, dst.stride, dst.data, src.offset, src.stride, src.data); } private: template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( const IndexType count, const IndexType dst_offset, const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data, const IndexType src_offset, const IndexType src_stride, const Scalar* EIGEN_RESTRICT src_data) { const Scalar* src = &src_data[src_offset]; Scalar* dst = &dst_data[dst_offset]; if (!Vectorizable) { for (Index i = 0; i < count; ++i) { dst[i * dst_stride] = src[i * src_stride]; } return; } const IndexType vectorized_size = count - PacketSize; IndexType i = 0; if (kind == StridedLinearBufferCopy::Kind::Linear) { // ******************************************************************** // // Linear copy from `src` to `dst`. const IndexType unrolled_size = count - 4 * PacketSize; eigen_assert(src_stride == 1 && dst_stride == 1); for (; i <= unrolled_size; i += 4 * PacketSize) { for (int j = 0; j < 4; ++j) { Packet p = ploadu(src + i + j * PacketSize); pstoreu(dst + i + j * PacketSize, p); } } for (; i <= vectorized_size; i += PacketSize) { Packet p = ploadu(src + i); pstoreu(dst + i, p); } for (; i < count; ++i) { dst[i] = src[i]; } // ******************************************************************** // } else if (kind == StridedLinearBufferCopy::Kind::Scatter) { // Scatter from `src` to `dst`. eigen_assert(src_stride == 1 && dst_stride != 1); for (; i <= vectorized_size; i += PacketSize) { Packet p = ploadu(src + i); pscatter(dst + i * dst_stride, p, dst_stride); } for (; i < count; ++i) { dst[i * dst_stride] = src[i]; } // ******************************************************************** // } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) { // Fill `dst` with value at `*src`. eigen_assert(src_stride == 0 && dst_stride == 1); const IndexType unrolled_size = count - 4 * PacketSize; Packet p = pload1(src); for (; i <= unrolled_size; i += 4 * PacketSize) { for (int j = 0; j < 4; ++j) { pstoreu(dst + i + j * PacketSize, p); } } for (; i <= vectorized_size; i += PacketSize) { pstoreu(dst + i, p); } for (; i < count; ++i) { dst[i] = *src; } // ******************************************************************** // } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) { // Scatter `*src` into `dst`. eigen_assert(src_stride == 0 && dst_stride != 1); Packet p = pload1(src); for (; i <= vectorized_size; i += PacketSize) { pscatter(dst + i * dst_stride, p, dst_stride); } for (; i < count; ++i) { dst[i * dst_stride] = *src; } // ******************************************************************** // } else if (kind == StridedLinearBufferCopy::Kind::Gather) { // Gather from `src` into `dst`. 
eigen_assert(dst_stride == 1); for (; i <= vectorized_size; i += PacketSize) { Packet p = pgather(src + i * src_stride, src_stride); pstoreu(dst + i, p); } for (; i < count; ++i) { dst[i] = src[i * src_stride]; } // ******************************************************************** // } else if (kind == StridedLinearBufferCopy::Kind::Random) { // Random. for (; i < count; ++i) { dst[i * dst_stride] = src[i * src_stride]; } } else { eigen_assert(false); } } }; // -------------------------------------------------------------------------- // // TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block. // It's possible to specify src->dst dimension mapping for the copy operation. // Dimensions of `dst` specify how many elements have to be copied, for the // `src` we need to know only stride to navigate through source memory buffer. template class TensorBlockIO { static const bool IsColMajor = (Layout == ColMajor); typedef StridedLinearBufferCopy LinCopy; public: typedef DSizes Dimensions; typedef DSizes DimensionsMap; struct Dst { Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, IndexType dst_offset = 0) : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {} Dimensions dims; Dimensions strides; Scalar* data; IndexType offset; }; struct Src { Src(const Dimensions& src_strides, const Scalar* src, IndexType src_offset = 0) : strides(src_strides), data(src), offset(src_offset) {} Dimensions strides; const Scalar* data; IndexType offset; }; // Copies data to `dst` from `src`, using provided dimensions mapping: // // src_dimension_index = dst_to_src_dim_map[dst_dimension_index] // // Returns the number of copied elements. static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy( const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) { // Copy single scalar value from `src` to `dst`. if (NumDims == 0) { *(dst.data + dst.offset) = *(src.data + src.offset); return 1; } // Both `dst` and `src` must have contiguous innermost dimension. We also // accept the special case with stride '0', because it's used as a trick to // implement broadcasting. { int inner_dim = IsColMajor ? 0 : NumDims - 1; EIGEN_UNUSED_VARIABLE(inner_dim); eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0); eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0); } // Give a shorter name to `dst_to_src_dim_map`. const DimensionsMap& dim_map = dst_to_src_dim_map; // Do not squeeze reordered inner dimensions. int num_squeezable_dims = NumSqueezableInnerDims(dim_map); // NOTE: We find the innermost dimension (contiguous in memory) in the dst // block, and we write data linearly into that dimension, reading it from // the src. If dimensions are reordered, we might end up reading data from // the src with `stride != 1`. // // NOTE: Random-Read/Linear-Write can be up to ~2X faster than // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680 // Find the innermost dimension in the dst whose size is not 1. This is the // effective inner dim. int num_size_one_inner_dims = 0; for (int i = 0; i < num_squeezable_dims; ++i) { const int dst_dim = IsColMajor ? i : NumDims - i - 1; if (dst.dims[dst_dim] != 1) break; num_size_one_inner_dims++; } // If all dimensions are of size 1, just copy a scalar from `src` to `dst`. if (num_size_one_inner_dims == NumDims) { *(dst.data + dst.offset) = *(src.data + src.offset); return 1; } // Outermost dimension in the dst with `stride == 1` (contiguous in memory). 
const int dst_stride1_dim = IsColMajor ? num_size_one_inner_dims : NumDims - num_size_one_inner_dims - 1; // Dimension in the src that corresponds to the dst innermost dimension. const int src_dim_for_dst_stride1_dim = NumDims == 0 ? 1 : dim_map[dst_stride1_dim]; // Size of the innermost dimension (length of contiguous blocks of memory). IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim]; // Squeeze multiple inner dims into one if they are contiguous in `dst` and // `src` memory, so we can do less linear copy calls. for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) { const int dst_dim = IsColMajor ? i : NumDims - i - 1; const IndexType dst_stride = dst.strides[dst_dim]; const IndexType src_stride = src.strides[dim_map[dst_dim]]; if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) { dst_inner_dim_size *= dst.dims[dst_dim]; ++num_size_one_inner_dims; } else { break; } } // Setup strides to read data from `src` and write to `dst`. IndexType input_offset = src.offset; IndexType output_offset = dst.offset; IndexType input_stride = NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim]; IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim]; const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; array it; // Initialize block iterator state. Squeeze away any dimension of size 1. int idx = 0; // currently initialized iterator state index for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2; if (dst.dims[dst_dim] == 1) continue; it[idx].size = dst.dims[dst_dim]; it[idx].input_stride = src.strides[dim_map[dst_dim]]; it[idx].output_stride = dst.strides[dst_dim]; it[idx].input_span = it[idx].input_stride * (it[idx].size - 1); it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); idx++; } // Iterate copying data from src to dst. const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize(); #define COPY_INNER_DIM(KIND) \ IndexType num_copied = 0; \ for (num_copied = 0; num_copied < block_total_size; \ num_copied += dst_inner_dim_size) { \ LinCopy::template Run( \ typename LinCopy::Dst(output_offset, output_stride, dst.data), \ typename LinCopy::Src(input_offset, input_stride, src.data), \ dst_inner_dim_size); \ \ for (int j = 0; j < idx; ++j) { \ if (++it[j].count < it[j].size) { \ input_offset += it[j].input_stride; \ output_offset += it[j].output_stride; \ break; \ } \ it[j].count = 0; \ input_offset -= it[j].input_span; \ output_offset -= it[j].output_span; \ } \ } \ return num_copied; if (input_stride == 1 && output_stride == 1) { COPY_INNER_DIM(LinCopy::Kind::Linear); } else if (input_stride == 1 && output_stride != 1) { COPY_INNER_DIM(LinCopy::Kind::Scatter); } else if (input_stride == 0 && output_stride == 1) { COPY_INNER_DIM(LinCopy::Kind::FillLinear); } else if (input_stride == 0 && output_stride != 1) { COPY_INNER_DIM(LinCopy::Kind::FillScatter); } else if (output_stride == 1) { COPY_INNER_DIM(LinCopy::Kind::Gather); } else { COPY_INNER_DIM(LinCopy::Kind::Random); } #undef COPY_INNER_DIM } // Copy from `src` to `dst` with an identity src->dst dimension map. Returns // the number of copied elements. 
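  //
  // A usage sketch (illustrative only; `block_dims`, `dst_ptr` and `src_ptr`
  // are assumed to describe two column-major buffers of the same dimensions):
  //
  //   typedef TensorBlockIO<float, Index, 3, ColMajor> BlockIO;
  //   BlockIO::Dst dst(block_dims, internal::strides<ColMajor>(block_dims),
  //                    dst_ptr);
  //   BlockIO::Src src(internal::strides<ColMajor>(block_dims), src_ptr);
  //   BlockIO::Copy(dst, src);  // identity dimension map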
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst, const Src& src) { DimensionsMap dst_to_src_map; for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i; return Copy(dst, src, dst_to_src_map); } private: struct BlockIteratorState { BlockIteratorState() : size(0), count(0), input_stride(0), output_stride(0), input_span(0), output_span(0) {} IndexType size; IndexType count; IndexType input_stride; IndexType output_stride; IndexType input_span; IndexType output_span; }; // Compute how many inner dimensions it's allowed to squeeze when doing IO // between two tensor blocks. It's safe to squeeze inner dimensions, only // if they are not reordered. static int NumSqueezableInnerDims(const DimensionsMap& dim_map) { int num_squeezable_dims = 0; for (int i = 0; i < NumDims; ++i) { const int dim = IsColMajor ? i : NumDims - i - 1; if (dim_map[dim] != dim) break; num_squeezable_dims++; } return num_squeezable_dims; } }; // -------------------------------------------------------------------------- // // TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to // a Tensor block defined by `desc`, backed by a memory buffer at `target`. // // Currently there is no way to write from a Tensor expression to a block of // memory, if dimensions are reordered. If you need to do that, you should // materialize a Tensor block expression into a memory buffer, and then use // TensorBlockIO to copy data between two memory buffers with a custom // `target->src` dimension map (see definition above). // // Also currently the innermost dimension of `target` must have a stride '1' // (contiguous in memory). This restriction could be lifted with a `pscatter`, // but in practice it's never needed, and there is a similar TensorBlockIO // workaround for that. // // TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO // where `src` is a tensor expression. Explore if it is possible to rewrite IO // to use expressions instead of pointers, and after that TensorBlockAssignment // will become an alias to IO. template class TensorBlockAssignment { // We will use coeff/packet path to evaluate block expressions. 
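  //
  // A usage sketch (illustrative only; `BlockExpr`, `expr`, `block_dims` and
  // `dst_ptr` are hypothetical, assuming a column-major block expression whose
  // dimensions match `block_dims`):
  //
  //   typedef TensorBlockAssignment<float, 3, BlockExpr, Index> BlockAssign;
  //   BlockAssign::Run(
  //       BlockAssign::target(block_dims,
  //                           internal::strides<ColMajor>(block_dims),
  //                           dst_ptr, /*target_offset=*/0),
  //       expr);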
typedef TensorEvaluator TensorBlockEvaluator; typedef DSizes Dimensions; enum { Vectorizable = packet_traits::Vectorizable, PacketSize = packet_traits::size }; template struct InnerDimAssign { EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) { for (IndexType i = 0; i < count; ++i) { target[i] = eval.coeff(eval_offset + i); } } }; template struct InnerDimAssign { EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) { typedef typename packet_traits::type Packet; const IndexType unrolled_size = count - 4 * PacketSize; const IndexType vectorized_size = count - PacketSize; IndexType i = 0; for (; i <= unrolled_size; i += 4 * PacketSize) { for (int j = 0; j < 4; ++j) { const IndexType idx = eval_offset + i + j * PacketSize; Packet p = eval.template packet(idx); pstoreu(target + i + j * PacketSize, p); } } for (; i <= vectorized_size; i += PacketSize) { Packet p = eval.template packet(eval_offset + i); pstoreu(target + i, p); } for (; i < count; ++i) { target[i] = eval.coeff(eval_offset + i); } } }; public: struct Target { Target(const Dimensions& target_dims, const Dimensions& target_strides, Scalar* target_data, IndexType target_offset = 0) : dims(target_dims), strides(target_strides), data(target_data), offset(target_offset) {} Dimensions dims; Dimensions strides; Scalar* data; IndexType offset; }; static Target target(const Dimensions& target_dims, const Dimensions& target_strides, Scalar* target_data, IndexType target_offset = 0) { return Target(target_dims, target_strides, target_data, target_offset); } template static Target target( const DSizes& target_dims, const DSizes& target_strides, Scalar* target_data, IndexType target_offset = 0) { // DSizes constructor will do index type promotion if it's safe. return Target(Dimensions(target_dims), Dimensions(target_strides), target_data, target_offset); } static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( const Target& target, const TensorBlockExpr& expr) { // Prepare evaluator for block expression. DefaultDevice default_device; TensorBlockEvaluator eval(expr, default_device); // Tensor block expression dimension should match destination dimensions. eigen_assert(dimensions_match(target.dims, eval.dimensions())); static const int Layout = TensorBlockEvaluator::Layout; static const bool is_col_major = Layout == ColMajor; // Initialize output inner dimension size based on a layout. const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize(); const int inner_dim_idx = is_col_major ? 0 : NumDims - 1; IndexType output_inner_dim_size = target.dims[inner_dim_idx]; // Target inner dimension stride must be '1'. eigen_assert(target.strides[inner_dim_idx] == 1); // Squeeze multiple inner dims into one if they are contiguous in `target`. IndexType num_squeezed_dims = 0; for (Index i = 1; i < NumDims; ++i) { const Index dim = is_col_major ? i : NumDims - i - 1; const IndexType target_stride = target.strides[dim]; if (output_inner_dim_size == target_stride) { output_inner_dim_size *= target.dims[dim]; num_squeezed_dims++; } else { break; } } // Initialize output block iterator state. Dimension in this array are // always in inner_most -> outer_most order (col major layout). array it; int idx = 0; // currently initialized iterator state index for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) { const Index dim = is_col_major ? 
i + 1 : NumDims - i - 2; it[idx].count = 0; it[idx].size = target.dims[dim]; it[idx].output_stride = target.strides[dim]; it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); idx++; } // We read block expression from the beginning, and start writing data to // `target` at given offset. IndexType input_offset = 0; IndexType output_offset = target.offset; // Iterate copying data from `eval` to `target`. for (IndexType i = 0; i < output_size; i += output_inner_dim_size) { // Assign to `target` at current offset. InnerDimAssign::Run(target.data + output_offset, output_inner_dim_size, eval, input_offset); // Move input offset forward by the number of assigned coefficients. input_offset += output_inner_dim_size; // Update index. for (int j = 0; j < idx; ++j) { if (++it[j].count < it[j].size) { output_offset += it[j].output_stride; break; } it[j].count = 0; output_offset -= it[j].output_span; } } } private: struct BlockIteratorState { BlockIteratorState() : count(0), size(0), output_stride(0), output_span(0) {} IndexType count; IndexType size; IndexType output_stride; IndexType output_span; }; }; // -------------------------------------------------------------------------- // } // namespace internal } // namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H