// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H
#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H

namespace Eigen {
namespace internal {

// -------------------------------------------------------------------------- //
// Forward declarations for templates defined below.
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIOV2;

// -------------------------------------------------------------------------- //
// Helper function to compute strides for densely stored buffer of given
// dimensions.

// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
// this function instead everywhere.
template <int Layout, typename IndexType, int NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
    const DSizes<IndexType, NumDims>& dimensions) {
  DSizes<IndexType, NumDims> strides;
  if (NumDims == 0) return strides;

  // TODO(ezhulenev): Use templates to unroll this loop (similar to
  // h_array_reduce in CXX11meta.h)? Benchmark it.
  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    strides[0] = 1;
    for (int i = 1; i < NumDims; ++i) {
      strides[i] = strides[i - 1] * dimensions[i - 1];
    }
  } else {
    strides[NumDims - 1] = 1;
    for (int i = NumDims - 2; i >= 0; --i) {
      strides[i] = strides[i + 1] * dimensions[i + 1];
    }
  }

  return strides;
}

template <int Layout, typename IndexType, size_t NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
    const Eigen::array<IndexType, NumDims>& dimensions) {
  return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
}

#if EIGEN_HAS_CXX11
template <int Layout, std::ptrdiff_t... Indices>
EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
    const Sizes<Indices...>& sizes) {
  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
}
#endif
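
// Example (illustrative sketch, not part of this header): computing dense
// strides for a 2x3x4 shape. For ColMajor the first dimension has stride 1,
// for RowMajor the last one does.
//
//   DSizes<Eigen::Index, 3> dims(2, 3, 4);
//   DSizes<Eigen::Index, 3> col = strides<ColMajor>(dims);  // [1, 2, 6]
//   DSizes<Eigen::Index, 3> row = strides<RowMajor>(dims);  // [12, 4, 1]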
// -------------------------------------------------------------------------- //
// TensorBlockDescriptor specifies a block offset within a tensor and the block
// sizes along each of the tensor dimensions.

template <int NumDims, typename IndexType = Eigen::Index>
class TensorBlockDescriptor {
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;

  // If we evaluate a Tensor assignment, and the expression on the left already
  // has a memory buffer, then we can do a performance optimization: evaluate
  // the root expression directly into that memory, or use it as temporary
  // storage for some of the subexpressions, to avoid dynamic memory allocation.
  //
  // This is type-erased storage, because passing the Scalar type through all
  // the expression evaluation layers would require way too many templates. It
  // should also be possible to use this destination as a temp buffer for
  // materializing expressions whose type does not match the final output.
  class DestinationBuffer {
   public:
    template <typename Scalar>
    Scalar* data() const {
      return static_cast<Scalar*>(m_data);
    }

    template <typename Scalar>
    Dimensions dimensions() const {
      Dimensions dimensions;
      for (int i = 0; i < NumDims; ++i) {
        eigen_assert(m_dimensions[i] % sizeof(Scalar) == 0);
        dimensions[i] = m_dimensions[i] / sizeof(Scalar);
      }
      return dimensions;
    }

    template <typename Scalar>
    Dimensions strides() const {
      Dimensions strides;
      for (int i = 0; i < NumDims; ++i) {
        eigen_assert(m_strides[i] % sizeof(Scalar) == 0);
        strides[i] = m_strides[i] / sizeof(Scalar);
      }
      return strides;
    }

    // Returns true if the tensor block corresponding to `desc` fits into the
    // contiguous block of memory defined by `*this`.
    template <typename Scalar, int Layout>
    bool fitsContiguously(const TensorBlockDescriptor& desc) const {
      if (m_data == NULL) return false;

      const Dimensions& desc_dims = desc.dimensions();
      const Dimensions& dst_dims = dimensions<Scalar>();

      if (!dimensions_match(desc_dims, dst_dims)) return false;

      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
      const Dimensions& dst_strides = strides<Scalar>();

      // Compare strides ignoring dimensions of size `1`.
      for (int i = 0; i < NumDims; ++i) {
        if (desc_dims[i] == 1) continue;
        if (desc_strides[i] != dst_strides[i]) return false;
      }

      return true;
    }

   private:
    friend class TensorBlockDescriptor;

    DestinationBuffer() : m_data(NULL), m_total_dst_bytes(0) {}

    template <typename Scalar>
    DestinationBuffer(Scalar* data, const Dimensions& dimensions,
                      const Dimensions& strides, size_t total_dst_bytes)
        : m_data(static_cast<void*>(data)),
          m_dimensions(dimensions),
          m_strides(strides),
          m_total_dst_bytes(total_dst_bytes) {
      // TODO(ezhulenev): Benchmark template meta-unroll for this loop.
      for (int i = 0; i < NumDims; ++i) {
        m_dimensions[i] *= sizeof(Scalar);
        m_strides[i] *= sizeof(Scalar);
      }
    }

    void* m_data;
    Dimensions m_dimensions;
    Dimensions m_strides;

    // Total size of the memory buffer at the destination (typically the total
    // size of the left hand side of an assignment expression). This can be the
    // same as `array_prod(m_dimensions)` if the assignment target has just a
    // single block, but typically it's a larger number.
    size_t m_total_dst_bytes;
  };

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
                        const DestinationBuffer& destination)
      : m_offset(offset),
        m_dimensions(dimensions),
        m_destination(destination) {}

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
      : m_offset(offset),
        m_dimensions(dimensions),
        m_destination(DestinationBuffer()) {}

  IndexType offset() const { return m_offset; }
  const Dimensions& dimensions() const { return m_dimensions; }
  IndexType dimension(int index) const { return m_dimensions[index]; }
  IndexType size() const { return array_prod(m_dimensions); }

  template <typename Scalar>
  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides,
                            size_t total_dst_bytes) {
    m_destination =
        DestinationBuffer(dst_base, m_dimensions, dst_strides, total_dst_bytes);
  }

  template <typename Scalar, typename DstStridesIndexType>
  void AddDestinationBuffer(
      Scalar* dst_base, const DSizes<DstStridesIndexType, NumDims>& dst_strides,
      size_t total_dst_bytes) {
    // DSizes constructor will do index type promotion if it's safe.
    AddDestinationBuffer(dst_base, Dimensions(dst_strides), total_dst_bytes);
  }

  TensorBlockDescriptor& DropDestinationBuffer() {
    m_destination.m_data = NULL;
    return *this;
  }

  bool HasDestinationBuffer() const { return m_destination.m_data != NULL; }

  const DestinationBuffer& GetDestinationBuffer() const {
    return m_destination;
  }

  // Returns a non-nullptr pointer to a destination buffer memory if this
  // block has a contiguous destination buffer.
  template <typename Scalar, int Layout>
  Scalar* destination() const {
    if (m_destination.template fitsContiguously<Scalar, Layout>(*this)) {
      return m_destination.template data<Scalar>();
    }
    return NULL;
  }

  // Returns a copy of `*this` with updated offset.
  TensorBlockDescriptor WithOffset(IndexType offset) const {
    return TensorBlockDescriptor(offset, m_dimensions, m_destination);
  }

 private:
  // Offset and dimensions are immutable after construction. Block descriptor
  // can only be mutated by adding or dropping destination.
  const IndexType m_offset;
  const Dimensions m_dimensions;
  DestinationBuffer m_destination;
};
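
// Example (illustrative sketch): describing a 2x4 block at linear offset 6 of
// a larger tensor, and attaching the destination buffer of an assignment so
// evaluators may materialize the block directly into it. The `dst_data`,
// `dst_strides` and `dst_bytes` values below are hypothetical.
//
//   typedef TensorBlockDescriptor<2, Eigen::Index> BlockDesc;
//   BlockDesc desc(/*offset=*/6, BlockDesc::Dimensions(2, 4));
//   desc.AddDestinationBuffer(dst_data, dst_strides, dst_bytes);
//
//   // Non-null only if the destination is contiguous and matches the block.
//   float* direct = desc.destination<float, ColMajor>();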
// -------------------------------------------------------------------------- //
// TensorBlockScratchAllocator is responsible for allocating temporary buffers
// for block evaluation (output or input block materialization). Given that
// Eigen expression traversal order is deterministic, all temporary allocations
// happen in the same order, and usually have exactly the same size. The
// scratch allocator keeps track of all dynamic allocations, and after the
// first block evaluation is completed, we should be able to reuse all the
// temporary buffers for the next block evaluation.

template <typename Device>
class TensorBlockScratchAllocator {
 public:
  explicit TensorBlockScratchAllocator(const Device& device)
      : m_device(device), m_allocation_index(0) {}

  ~TensorBlockScratchAllocator() {
    for (size_t i = 0; i < m_allocations.size(); ++i) {
      m_device.deallocate(m_allocations[i].ptr);
    }
  }

  void* allocate(size_t size) {
    // TODO(ezhulenev): Remove when replaced with inlined vector.
    if (m_allocations.capacity() == 0) m_allocations.reserve(8);

    // Check if we already have an existing allocation at the current index.
    const int num_allocations = static_cast<int>(m_allocations.size());
    const bool has_allocation = m_allocation_index < num_allocations;

    // Allocation index can't be larger than the number of allocations.
    eigen_assert(m_allocation_index <= num_allocations);

    // If we have an existing allocation, and its size is larger or equal to
    // the requested size, we do nothing.

    // If the current allocation can't fit the requested size, we deallocate
    // it, and replace it with a larger allocation.
    if (has_allocation && m_allocations[m_allocation_index].size < size) {
      m_device.deallocate(m_allocations[m_allocation_index].ptr);
      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
      m_allocations[m_allocation_index].size = size;
    }

    // Make a new allocation if we don't have an existing one.
    if (!has_allocation) {
      Allocation allocation;
      allocation.ptr = m_device.allocate(size);
      allocation.size = size;
      m_allocations.push_back(allocation);
    }

    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
    eigen_assert(m_allocations[m_allocation_index].size >= size);

    return m_allocations[m_allocation_index++].ptr;
  }

  void reset() { m_allocation_index = 0; }

 private:
  struct Allocation {
    void* ptr;
    size_t size;
  };

  const Device& m_device;
  int m_allocation_index;
  // TODO(ezhulenev): This should be an inlined vector.
  std::vector<Allocation> m_allocations;
};
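
// Example (illustrative sketch): a scratch allocator is reused across block
// evaluations; `reset()` rewinds it so the same buffers are handed out again
// in the same deterministic order. `num_blocks` and `block_bytes` below are
// hypothetical values.
//
//   DefaultDevice device;
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//   for (Eigen::Index block = 0; block < num_blocks; ++block) {
//     void* tmp = scratch.allocate(block_bytes);
//     // ... materialize the block into `tmp` ...
//     scratch.reset();  // buffers are reused by the next iteration
//   }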
// -------------------------------------------------------------------------- //
// TensorBlockKind represents all possible block kinds that can be produced by
// the TensorEvaluator::evalBlock function.
#if !EIGEN_HAS_CXX11
// To be able to use `TensorBlockKind::kExpr` in C++03 we need a namespace.
// (Use of enumeration in a nested name specifier is a c++11 extension).
namespace TensorBlockKind {
#endif
enum TensorBlockKind {
  // Tensor block that is a lazy expression that must be assigned to a
  // destination using TensorBlockAssign.
  kExpr,

  // Tensor block that is a view into a memory buffer owned by an underlying
  // Tensor expression (e.g. it can be a view into a Tensor buffer).
  kView,

  // Tensor block that was materialized in a scratch memory buffer, allocated
  // with TensorBlockScratchAllocator. This block must be copied to a
  // destination, similar to a block of `kExpr` type.
  kMaterializedInScratch,

  // Tensor block that was materialized directly into the final output memory
  // buffer. For example if the left side of an assignment is a Tensor, we can
  // directly materialize the block in the destination memory.
  //
  // If strides in the output buffer do not match tensor block strides, the
  // Tensor expression will be invalid, and should not be used by
  // TensorBlockAssign or for constructing another block expression.
  kMaterializedInOutput
};
#if !EIGEN_HAS_CXX11
}  // namespace TensorBlockKind
#endif

// -------------------------------------------------------------------------- //
// TensorBlockNotImplemented should be used to define the TensorBlock typedef
// in TensorEvaluators that do not support block evaluation.

class TensorBlockNotImplemented {
 public:
  typedef void XprType;
};

// -------------------------------------------------------------------------- //
// XprScalar extracts the Scalar type from the Eigen expressions (if the
// expression type is not void). It's required to be able to define a lazy
// block expression for argument types that do not support block evaluation.

template <typename XprType>
struct XprScalar {
  typedef typename XprType::Scalar type;
};
template <>
struct XprScalar<void> {
  typedef void type;
};

// -------------------------------------------------------------------------- //
// TensorMaterializedBlock is a fully evaluated block of the original tensor,
// and XprType is just a TensorMap over the data. This block type is typically
// used to materialize blocks of tensor expressions that can't be efficiently
// represented as lazy Tensor expressions with fast coeff/packet operations,
// e.g. we materialize all broadcasts into evaluated blocks.
//
// TensorMaterializedBlock does not own its memory buffer, it's either a memory
// buffer that backs the original expression (e.g. block is just a view into a
// Tensor), or a memory buffer allocated with scratch allocator, and in this
// case the scratch allocator will deallocate it at the end of block based
// expression execution.
//
// If the block was evaluated directly into the output buffer, and strides in
// the output buffer do not match block strides, the TensorMap expression will
// be invalid, and should never be used in block assignment or any other tensor
// expression.

template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorMaterializedBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;

  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
                          const Dimensions& dimensions, bool valid_expr = true)
      : m_kind(kind),
        m_data(data),
        m_dimensions(dimensions),
        m_expr(m_data, m_dimensions),
        m_valid_expr(valid_expr) {
    eigen_assert(m_kind == internal::TensorBlockKind::kView ||
                 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
                 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
  }

  TensorBlockKind kind() const { return m_kind; }
  // NOTE(ezhulenev): Returning XprType by value like in other block types
  // causes asan failures. The theory is that XprType::Nested doesn't work
  // properly for TensorMap.
  const XprType& expr() const {
    eigen_assert(m_valid_expr);
    return m_expr;
  }
  const Scalar* data() const { return m_data; }
  void cleanup() {}

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;

  // Creates a materialized block for the given descriptor from a memory buffer.
  template <typename DataDimensions, typename TensorBlockScratch>
  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
      const Scalar* data, const DataDimensions& data_dims,
      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
    eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());

    // If the tensor block dimensions cover a contiguous block of the
    // underlying memory, we can skip block buffer memory allocation, and
    // construct a block from the existing `data` memory buffer.
    //
    // Example: (RowMajor layout)
    //   data_dims:          [11, 12, 13, 14]
    //   desc.dimensions():  [1,   1,  3, 14]
    //
    // In this case we can construct a TensorBlock starting at
    // `data + desc.offset()`, with `desc.dimensions()` block sizes.
    static const bool is_col_major = Layout == ColMajor;

    // Find out how many inner dimensions have a matching size.
    int num_matching_inner_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      int dim = is_col_major ? i : NumDims - i - 1;
      if (data_dims[dim] != desc.dimensions()[dim]) break;
      ++num_matching_inner_dims;
    }

    // All the outer dimensions must be of size `1`, except a single dimension
    // before the matching inner dimension (`3` in the example above).
    bool can_use_direct_access = true;
    for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
      int dim = is_col_major ? i : NumDims - i - 1;
      if (desc.dimension(dim) != 1) {
        can_use_direct_access = false;
        break;
      }
    }

    if (can_use_direct_access) {
      const Scalar* block_start = data + desc.offset();
      return TensorMaterializedBlock(internal::TensorBlockKind::kView,
                                     block_start, desc.dimensions());

    } else {
      // Try to reuse destination as an output block buffer.
      Scalar* block_buffer = desc.template destination<Scalar, Layout>();
      bool materialized_in_output;

      if (block_buffer != NULL) {
        desc.DropDestinationBuffer();
        materialized_in_output = true;

      } else {
        materialized_in_output = false;
        void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
        block_buffer = static_cast<Scalar*>(mem);
      }

      typedef internal::TensorBlockIOV2<Scalar, IndexType, NumDims, Layout>
          TensorBlockIO;
      typedef typename TensorBlockIO::Dst TensorBlockIODst;
      typedef typename TensorBlockIO::Src TensorBlockIOSrc;

      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
                           data, desc.offset());
      TensorBlockIODst dst(desc.dimensions(),
                           internal::strides<Layout>(desc.dimensions()),
                           block_buffer);

      TensorBlockIO::Copy(dst, src);

      return TensorMaterializedBlock(
          materialized_in_output
              ? internal::TensorBlockKind::kMaterializedInOutput
              : internal::TensorBlockKind::kMaterializedInScratch,
          block_buffer, desc.dimensions());
    }
  }

 private:
  TensorBlockKind m_kind;
  const Scalar* m_data;
  Dimensions m_dimensions;
  XprType m_expr;
  bool m_valid_expr;
};
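
// Example (illustrative sketch): materializing the block described by `desc`
// from a dense buffer. Depending on the block shape this returns a `kView`
// into `data`, or copies the block into the destination/scratch buffer. The
// `data`, `data_dims`, `desc` and `scratch` objects below are assumed to be
// set up by the caller (e.g. a TensorEvaluator).
//
//   typedef TensorMaterializedBlock<float, Eigen::Index, 3, ColMajor> Block;
//   Block block = Block::materialize(data, data_dims, desc, scratch);
//
//   const float* block_data = block.data();     // evaluated coefficients
//   const Block::XprType& expr = block.expr();  // TensorMap over block_data
//   block.cleanup();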
// -------------------------------------------------------------------------- //
// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
// functor to the blocks produced by the underlying Tensor expression.

template <typename UnaryOp, typename ArgTensorBlock>
class TensorCwiseUnaryBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  static const bool NoArgBlockAccess =
      internal::is_void<typename ArgTensorBlock::XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
      type XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
      : m_arg_block(arg_block), m_functor(functor) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }

  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  UnaryOp m_functor;
};

// -------------------------------------------------------------------------- //
// TensorCwiseBinaryBlock is a lazy tensor expression block that applies
// BinaryOp functor to the blocks produced by the underlying Tensor expressions.

template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
class TensorCwiseBinaryBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  static const bool NoArgBlockAccess =
      internal::is_void<typename LhsTensorBlock::XprType>::value ||
      internal::is_void<typename RhsTensorBlock::XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
                          const typename RhsTensorBlock::XprType> >::type
      XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
                         const RhsTensorBlock& right_block,
                         const BinaryOp& functor)
      : m_left_block(left_block),
        m_right_block(right_block),
        m_functor(functor) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }

  XprType expr() const {
    return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
  }

  const Scalar* data() const { return NULL; }

  void cleanup() {
    m_left_block.cleanup();
    m_right_block.cleanup();
  }

 private:
  LhsTensorBlock m_left_block;
  RhsTensorBlock m_right_block;
  BinaryOp m_functor;
};

// -------------------------------------------------------------------------- //
// TensorUnaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from a block of the underlying type (this is
// a generalization of the TensorCwiseUnaryBlock for arbitrary expressions).

template <typename BlockFactory, typename ArgTensorBlock>
class TensorUnaryExprBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  typedef typename ArgTensorBlock::XprType ArgXprType;
  static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
                       const BlockFactory& factory)
      : m_arg_block(arg_block), m_factory(factory) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  BlockFactory m_factory;
};
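
// Example (illustrative sketch): wrapping the blocks produced by two argument
// evaluators into a lazy sum block. `lhs_block` and `rhs_block` are
// hypothetical argument blocks; the resulting `expr()` is a regular
// TensorCwiseBinaryOp that can later be assigned with TensorBlockAssignment.
//
//   typedef TensorMaterializedBlock<float, Eigen::Index, 2, ColMajor> ArgBlock;
//   typedef scalar_sum_op<float> Sum;
//   typedef TensorCwiseBinaryBlock<Sum, ArgBlock, ArgBlock> SumBlock;
//
//   SumBlock block(lhs_block, rhs_block, Sum());
//   SumBlock::XprType sum_expr = block.expr();  // lazy lhs + rhs
//   block.cleanup();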
// -------------------------------------------------------------------------- //
// TensorTernaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from three blocks of the underlying type.

template <typename BlockFactory, typename Arg1TensorBlock,
          typename Arg2TensorBlock, typename Arg3TensorBlock>
class TensorTernaryExprBlock {
#if !EIGEN_HAS_CXX11
  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif

  typedef typename Arg1TensorBlock::XprType Arg1XprType;
  typedef typename Arg2TensorBlock::XprType Arg2XprType;
  typedef typename Arg3TensorBlock::XprType Arg3XprType;

  static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
                                       internal::is_void<Arg2XprType>::value ||
                                       internal::is_void<Arg3XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
                                              Arg3XprType>::type>::type XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
                         const Arg2TensorBlock& arg2_block,
                         const Arg3TensorBlock& arg3_block,
                         const BlockFactory& factory)
      : m_arg1_block(arg1_block),
        m_arg2_block(arg2_block),
        m_arg3_block(arg3_block),
        m_factory(factory) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
  XprType expr() const {
    return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
                          m_arg3_block.expr());
  }
  const Scalar* data() const { return NULL; }
  void cleanup() {
    m_arg1_block.cleanup();
    m_arg2_block.cleanup();
    m_arg3_block.cleanup();
  }

 private:
  Arg1TensorBlock m_arg1_block;
  Arg2TensorBlock m_arg2_block;
  Arg3TensorBlock m_arg3_block;
  BlockFactory m_factory;
};

// -------------------------------------------------------------------------- //
// StridedLinearBufferCopy provides a method to copy data between two linear
// buffers with different strides, with optimized paths for scatter/gather.

template <typename Scalar, typename IndexType>
class StridedLinearBufferCopy {
  typedef typename packet_traits<Scalar>::type Packet;
  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };

 public:
  // Specifying linear copy kind statically gives ~30% speedup for small sizes.
  enum Kind {
    Linear = 0,       // src_stride == 1 && dst_stride == 1
    Scatter = 1,      // src_stride == 1 && dst_stride != 1
    FillLinear = 2,   // src_stride == 0 && dst_stride == 1
    FillScatter = 3,  // src_stride == 0 && dst_stride != 1
    Gather = 4,       // dst_stride == 1
    Random = 5        // everything else
  };

  struct Dst {
    Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    Scalar* data;
  };

  struct Src {
    Src(IndexType o, IndexType s, const Scalar* d)
        : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    const Scalar* data;
  };

  template <typename StridedLinearBufferCopy::Kind kind>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
                                                        const Src& src,
                                                        const size_t count) {
    Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
              src.data);
  }

 private:
  template <typename StridedLinearBufferCopy::Kind kind>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const IndexType count, const IndexType dst_offset,
      const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
      const IndexType src_offset, const IndexType src_stride,
      const Scalar* EIGEN_RESTRICT src_data) {
    const Scalar* src = &src_data[src_offset];
    Scalar* dst = &dst_data[dst_offset];

    if (!Vectorizable) {
      for (Index i = 0; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
      return;
    }

    const IndexType vectorized_size = count - PacketSize;
    IndexType i = 0;

    if (kind == Linear) {
      // ****************************************************************** //
      // Linear copy from `src` to `dst`.
      const IndexType unrolled_size = count - 4 * PacketSize;
      eigen_assert(src_stride == 1 && dst_stride == 1);

      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          Packet p = ploadu<Packet>(src + i + j * PacketSize);
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = src[i];
      }
      // ****************************************************************** //
    } else if (kind == Scatter) {
      // Scatter from `src` to `dst`.
      eigen_assert(src_stride == 1 && dst_stride != 1);

      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i];
      }
      // ****************************************************************** //
    } else if (kind == FillLinear) {
      // Fill `dst` with value at `*src`.
      eigen_assert(src_stride == 0 && dst_stride == 1);

      const IndexType unrolled_size = count - 4 * PacketSize;
      Packet p = pload1<Packet>(src);
      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i <= vectorized_size; i += PacketSize) {
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = *src;
      }
      // ****************************************************************** //
    } else if (kind == FillScatter) {
      // Scatter `*src` into `dst`.
      eigen_assert(src_stride == 0 && dst_stride != 1);

      Packet p = pload1<Packet>(src);
      for (; i <= vectorized_size; i += PacketSize) {
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = *src;
      }
      // ****************************************************************** //
    } else if (kind == Gather) {
      // Gather from `src` into `dst`.
      eigen_assert(dst_stride == 1);

      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = src[i * src_stride];
      }
      // ****************************************************************** //
    } else if (kind == Random) {
      // Random.
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
    } else {
      eigen_assert(false);
    }
  }
};
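
// Example (illustrative sketch): scattering 16 contiguous floats from
// `src_ptr` into every 3rd element of `dst_ptr` (both hypothetical pointers).
// The statically selected `Kind` must match the runtime strides; it only
// picks the optimized code path.
//
//   typedef StridedLinearBufferCopy<float, Eigen::Index> Copier;
//   Copier::Run<Copier::Scatter>(
//       Copier::Dst(/*offset=*/0, /*stride=*/3, dst_ptr),
//       Copier::Src(/*offset=*/0, /*stride=*/1, src_ptr),
//       /*count=*/16);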
// -------------------------------------------------------------------------- //
// TensorBlockIO copies data from the `src` tensor block to the `dst` tensor
// block. It's possible to specify a src->dst dimension mapping for the copy
// operation. Dimensions of `dst` specify how many elements have to be copied;
// for `src` we only need to know the strides to navigate through the source
// memory buffer.

template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIOV2 {
  static const bool IsColMajor = (Layout == ColMajor);

  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;

 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef DSizes<int, NumDims> DimensionsMap;

  struct Dst {
    Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
        IndexType dst_offset = 0)
        : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  struct Src {
    Src(const Dimensions& src_strides, const Scalar* src,
        IndexType src_offset = 0)
        : strides(src_strides), data(src), offset(src_offset) {}

    Dimensions strides;
    const Scalar* data;
    IndexType offset;
  };

  // Copies data to `dst` from `src`, using provided dimensions mapping:
  //
  //   src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
  //
  // Returns the number of copied elements.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
      const Dst& dst, const Src& src,
      const DimensionsMap& dst_to_src_dim_map) {
    // Copy single scalar value from `src` to `dst`.
    if (NumDims == 0) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return 1;
    }

    // Both `dst` and `src` must have contiguous innermost dimension. We also
    // accept the special case with stride '0', because it's used as a trick to
    // implement broadcasting.
    {
      int inner_dim = IsColMajor ? 0 : NumDims - 1;
      EIGEN_UNUSED_VARIABLE(inner_dim);
      eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
      eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
    }

    // Give a shorter name to `dst_to_src_dim_map`.
    const DimensionsMap& dim_map = dst_to_src_dim_map;

    // Do not squeeze reordered inner dimensions.
    int num_squeezable_dims = NumSqueezableInnerDims(dim_map);

    // NOTE: We find the innermost dimension (contiguous in memory) in the dst
    // block, and we write data linearly into that dimension, reading it from
    // the src. If dimensions are reordered, we might end up reading data from
    // the src with `stride != 1`.
    //
    // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
    // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680

    // Find the innermost dimension in the dst whose size is not 1. This is the
    // effective inner dim.
    int num_size_one_inner_dims = 0;
    for (int i = 0; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      if (dst.dims[dst_dim] != 1) break;
      num_size_one_inner_dims++;
    }

    // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
    if (num_size_one_inner_dims == NumDims) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return 1;
    }

    // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
    const int dst_stride1_dim =
        IsColMajor ? num_size_one_inner_dims
                   : NumDims - num_size_one_inner_dims - 1;

    // Dimension in the src that corresponds to the dst innermost dimension.
    const int src_dim_for_dst_stride1_dim =
        NumDims == 0 ? 1 : dim_map[dst_stride1_dim];

    // Size of the innermost dimension (length of contiguous blocks of memory).
    IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];

    // Squeeze multiple inner dims into one if they are contiguous in `dst` and
    // `src` memory, so we can do fewer linear copy calls.
    for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      const IndexType dst_stride = dst.strides[dst_dim];
      const IndexType src_stride = src.strides[dim_map[dst_dim]];
      if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
        dst_inner_dim_size *= dst.dims[dst_dim];
        ++num_size_one_inner_dims;
      } else {
        break;
      }
    }

    // Setup strides to read data from `src` and write to `dst`.
    IndexType input_offset = src.offset;
    IndexType output_offset = dst.offset;
    IndexType input_stride =
        NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
    IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];

    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
    array<BlockIteratorState, at_least_1_dim> it;

    // Initialize block iterator state. Squeeze away any dimension of size 1.
    int idx = 0;  // currently initialized iterator state index
    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
      const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;

      if (dst.dims[dst_dim] == 1) continue;

      it[idx].size = dst.dims[dst_dim];
      it[idx].input_stride = src.strides[dim_map[dst_dim]];
      it[idx].output_stride = dst.strides[dst_dim];

      it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);

      idx++;
    }

    // Iterate copying data from src to dst.
    const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();

#define COPY_INNER_DIM(KIND)                                           \
  IndexType num_copied = 0;                                            \
  for (num_copied = 0; num_copied < block_total_size;                  \
       num_copied += dst_inner_dim_size) {                             \
    LinCopy::template Run<KIND>(                                       \
        typename LinCopy::Dst(output_offset, output_stride, dst.data), \
        typename LinCopy::Src(input_offset, input_stride, src.data),   \
        dst_inner_dim_size);                                           \
                                                                       \
    for (int j = 0; j < idx; ++j) {                                    \
      if (++it[j].count < it[j].size) {                                \
        input_offset += it[j].input_stride;                            \
        output_offset += it[j].output_stride;                          \
        break;                                                         \
      }                                                                \
      it[j].count = 0;                                                 \
      input_offset -= it[j].input_span;                                \
      output_offset -= it[j].output_span;                              \
    }                                                                  \
  }                                                                    \
  return num_copied;

    if (input_stride == 1 && output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Linear);
    } else if (input_stride == 1 && output_stride != 1) {
      COPY_INNER_DIM(LinCopy::Scatter);
    } else if (input_stride == 0 && output_stride == 1) {
      COPY_INNER_DIM(LinCopy::FillLinear);
    } else if (input_stride == 0 && output_stride != 1) {
      COPY_INNER_DIM(LinCopy::FillScatter);
    } else if (output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Gather);
    } else {
      COPY_INNER_DIM(LinCopy::Random);
    }

#undef COPY_INNER_DIM
  }

  // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
  // the number of copied elements.
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
                                                              const Src& src) {
    DimensionsMap dst_to_src_map;
    for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
    return Copy(dst, src, dst_to_src_map);
  }

 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : size(0),
          count(0),
          input_stride(0),
          output_stride(0),
          input_span(0),
          output_span(0) {}

    IndexType size;
    IndexType count;
    IndexType input_stride;
    IndexType output_stride;
    IndexType input_span;
    IndexType output_span;
  };

  // Compute how many inner dimensions it's allowed to squeeze when doing IO
  // between two tensor blocks. It's safe to squeeze inner dimensions only if
  // they are not reordered.
  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
    int num_squeezable_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      if (dim_map[dim] != dim) break;
      num_squeezable_dims++;
    }
    return num_squeezable_dims;
  }
};
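
// Example (illustrative sketch): copying a 2x3 ColMajor block between two
// buffers with different strides. `src_dims`, `src_ptr` and `dst_ptr` are
// hypothetical; with an identity dimension map the two-argument Copy overload
// is enough.
//
//   typedef TensorBlockIOV2<float, Eigen::Index, 2, ColMajor> BlockIO;
//   BlockIO::Dimensions block_dims(2, 3);
//
//   BlockIO::Dst dst(block_dims, strides<ColMajor>(block_dims), dst_ptr);
//   BlockIO::Src src(strides<ColMajor>(src_dims), src_ptr, /*src_offset=*/0);
//
//   BlockIO::Copy(dst, src);  // returns the number of copied elements (6)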
// -------------------------------------------------------------------------- //
// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
//
// Currently there is no way to write from a Tensor expression to a block of
// memory if dimensions are reordered. If you need to do that, you should
// materialize a Tensor block expression into a memory buffer, and then use
// TensorBlockIO to copy data between two memory buffers with a custom
// `target->src` dimension map (see definition above).
//
// Also currently the innermost dimension of `target` must have a stride '1'
// (contiguous in memory). This restriction could be lifted with a `pscatter`,
// but in practice it's never needed, and there is a similar TensorBlockIO
// workaround for that.
//
// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
// where `src` is a tensor expression. Explore if it is possible to rewrite IO
// to use expressions instead of pointers, and after that TensorBlockAssignment
// will become an alias to IO.
template <typename Scalar, int NumDims, typename TensorBlockExpr,
          typename IndexType = Eigen::Index>
class TensorBlockAssignment {
  // We will use coeff/packet path to evaluate block expressions.
  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
      TensorBlockEvaluator;

  typedef DSizes<IndexType, NumDims> Dimensions;

  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };

  template <bool Vectorizable, typename Evaluator>
  struct InnerDimAssign {
    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
                                        const Evaluator& eval,
                                        IndexType eval_offset) {
      for (IndexType i = 0; i < count; ++i) {
        target[i] = eval.coeff(eval_offset + i);
      }
    }
  };

  template <typename Evaluator>
  struct InnerDimAssign<true, Evaluator> {
    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
                                        const Evaluator& eval,
                                        IndexType eval_offset) {
      typedef typename packet_traits<Scalar>::type Packet;

      const IndexType unrolled_size = count - 4 * PacketSize;
      const IndexType vectorized_size = count - PacketSize;
      IndexType i = 0;

      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          const IndexType idx = eval_offset + i + j * PacketSize;
          Packet p = eval.template packet<Unaligned>(idx);
          pstoreu<Scalar>(target + i + j * PacketSize, p);
        }
      }

      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = eval.template packet<Unaligned>(eval_offset + i);
        pstoreu<Scalar>(target + i, p);
      }

      for (; i < count; ++i) {
        target[i] = eval.coeff(eval_offset + i);
      }
    }
  };

 public:
  struct Target {
    Target(const Dimensions& target_dims, const Dimensions& target_strides,
           Scalar* target_data, IndexType target_offset = 0)
        : dims(target_dims),
          strides(target_strides),
          data(target_data),
          offset(target_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  static Target target(const Dimensions& target_dims,
                       const Dimensions& target_strides, Scalar* target_data,
                       IndexType target_offset = 0) {
    return Target(target_dims, target_strides, target_data, target_offset);
  }

  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
  static Target target(
      const DSizes<TargetDimsIndexType, NumDims>& target_dims,
      const DSizes<TargetStridesIndexType, NumDims>& target_strides,
      Scalar* target_data, IndexType target_offset = 0) {
    // DSizes constructor will do index type promotion if it's safe.
    return Target(Dimensions(target_dims), Dimensions(target_strides),
                  target_data, target_offset);
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const Target& target, const TensorBlockExpr& expr) {
    // Prepare evaluator for block expression.
    DefaultDevice default_device;
    TensorBlockEvaluator eval(expr, default_device);

    // Tensor block expression dimension should match destination dimensions.
    eigen_assert(dimensions_match(target.dims, eval.dimensions()));

    static const int Layout = TensorBlockEvaluator::Layout;
    static const bool is_col_major = Layout == ColMajor;

    // Initialize output inner dimension size based on a layout.
    const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
    const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
    IndexType output_inner_dim_size = target.dims[inner_dim_idx];

    // Target inner dimension stride must be '1'.
    eigen_assert(target.strides[inner_dim_idx] == 1);

    // Squeeze multiple inner dims into one if they are contiguous in `target`.
    IndexType num_squeezed_dims = 0;
    for (Index i = 1; i < NumDims; ++i) {
      const Index dim = is_col_major ? i : NumDims - i - 1;
      const IndexType target_stride = target.strides[dim];

      if (output_inner_dim_size == target_stride) {
        output_inner_dim_size *= target.dims[dim];
        num_squeezed_dims++;
      } else {
        break;
      }
    }

    // Initialize output block iterator state. Dimensions in this array are
    // always in inner_most -> outer_most order (col major layout).
    array<BlockIteratorState, NumDims> it;

    int idx = 0;  // currently initialized iterator state index
    for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
      const Index dim = is_col_major ? i + 1 : NumDims - i - 2;

      it[idx].count = 0;
      it[idx].size = target.dims[dim];
      it[idx].output_stride = target.strides[dim];
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
      idx++;
    }

    // We read block expression from the beginning, and start writing data to
    // `target` at given offset.
    IndexType input_offset = 0;
    IndexType output_offset = target.offset;

    // Iterate copying data from `eval` to `target`.
    for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
      // Assign to `target` at current offset.
      InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
                     TensorBlockEvaluator>::Run(target.data + output_offset,
                                                output_inner_dim_size, eval,
                                                input_offset);

      // Move input offset forward by the number of assigned coefficients.
      input_offset += output_inner_dim_size;

      // Update index.
      for (int j = 0; j < idx; ++j) {
        if (++it[j].count < it[j].size) {
          output_offset += it[j].output_stride;
          break;
        }
        it[j].count = 0;
        output_offset -= it[j].output_span;
      }
    }
  }

 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : count(0), size(0), output_stride(0), output_span(0) {}

    IndexType count;
    IndexType size;
    IndexType output_stride;
    IndexType output_span;
  };
};

// -------------------------------------------------------------------------- //

}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H