author    Eugene Zhulenev <ezhulenev@google.com>  2019-12-10 15:40:23 -0800
committer Eugene Zhulenev <ezhulenev@google.com>  2019-12-10 15:40:23 -0800
commit    1c879eb010df8e53e5ac016ee5d155db2c721c2b (patch)
tree      2e2d9830a59f2a61ca8c13a1c5b77b362870fb34 /unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
parent    dbca11e8805ec07660d8f966a1884ad0be302f15 (diff)
Remove V2 suffix from TensorBlock
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h  1481
1 files changed, 1481 insertions, 0 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
new file mode 100644
index 000000000..222333847
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -0,0 +1,1481 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
+#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
+
+namespace Eigen {
+namespace internal {
+
+// -------------------------------------------------------------------------- //
+// Forward declarations for templates defined below.
+template <typename Scalar, typename IndexType, int NumDims, int Layout>
+class TensorBlockIO;
+
+// -------------------------------------------------------------------------- //
+// Helper function to compute strides for a densely stored buffer of given
+// dimensions.
+
+// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
+// this function instead everywhere.
+template <int Layout, typename IndexType, int NumDims>
+EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
+ const DSizes<IndexType, NumDims>& dimensions) {
+ DSizes<IndexType, NumDims> strides;
+ if (NumDims == 0) return strides;
+
+ // TODO(ezhulenev): Use templates to unroll this loop (similar to
+ // h_array_reduce in CXX11meta.h)? Benchmark it.
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ strides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ strides[i] = strides[i - 1] * dimensions[i - 1];
+ }
+ } else {
+ strides[NumDims - 1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ strides[i] = strides[i + 1] * dimensions[i + 1];
+ }
+ }
+
+ return strides;
+}
+
+template <int Layout, typename IndexType, size_t NumDims>
+EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
+ const Eigen::array<IndexType, NumDims>& dimensions) {
+ return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
+}
+
+template <int Layout, std::ptrdiff_t... Indices>
+EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
+ const Sizes<Indices...>& sizes) {
+ return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
+}
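+
+// Example (illustrative sketch): for a buffer of dimensions [2, 3, 4] the
+// ColMajor strides are [1, 2, 6] and the RowMajor strides are [12, 4, 1]:
+//
+//   DSizes<Eigen::Index, 3> dims(2, 3, 4);
+//   DSizes<Eigen::Index, 3> col = strides<ColMajor>(dims);  // {1, 2, 6}
+//   DSizes<Eigen::Index, 3> row = strides<RowMajor>(dims);  // {12, 4, 1}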
+
+// -------------------------------------------------------------------------- //
+
+// Tensor block shape type defines the shape preference for the blocks
+// extracted from the larger tensor.
+//
+// Example: blocks of 100 elements from the large 100x100 tensor:
+// - tensor: 100x100
+// - target_block_size: 100
+//
+// TensorBlockShapeType:
+// - kUniformAllDims: 100 blocks of size 10x10
+// - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column
+// or row major layout)
+enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
+
+struct TensorBlockResourceRequirements {
+ TensorBlockShapeType shape_type;
+ size_t size;
+
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
+ merge(const TensorBlockResourceRequirements &lhs,
+ const TensorBlockResourceRequirements &rhs) {
+ return {merge(lhs.shape_type, rhs.shape_type), merge(lhs.size, rhs.size)};
+ }
+
+ // This is a resource requirement that should be returned from expressions
+ // that do not have any block evaluation preference (e.g. default tensor
+ // expression with raw buffer access).
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
+ return {TensorBlockShapeType::kUniformAllDims, 1};
+ }
+
+private:
+ using Requirements = TensorBlockResourceRequirements;
+
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
+ return numext::maxi(lhs_size, rhs_size);
+ }
+
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE TensorBlockShapeType merge(TensorBlockShapeType lhs,
+ TensorBlockShapeType rhs) {
+ return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
+ rhs == TensorBlockShapeType::kSkewedInnerDims)
+ ? TensorBlockShapeType::kSkewedInnerDims
+ : TensorBlockShapeType::kUniformAllDims;
+ }
+};
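+
+// Example (illustrative sketch): merging a skewed requirement of size 128
+// with a uniform requirement of size 512 yields a skewed requirement of size
+// 512, because the skewed shape wins and sizes merge to the maximum:
+//
+//   TensorBlockResourceRequirements lhs{TensorBlockShapeType::kSkewedInnerDims, 128};
+//   TensorBlockResourceRequirements rhs{TensorBlockShapeType::kUniformAllDims, 512};
+//   TensorBlockResourceRequirements req = TensorBlockResourceRequirements::merge(lhs, rhs);
+//   // req.shape_type == kSkewedInnerDims, req.size == 512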
+
+// -------------------------------------------------------------------------- //
+// TensorBlockDescriptor specifies a block offset within a tensor and the block
+// sizes along each of the tensor dimensions.
+
+template <int NumDims, typename IndexType = Eigen::Index>
+class TensorBlockDescriptor {
+ public:
+ typedef DSizes<IndexType, NumDims> Dimensions;
+
+ // If we evaluate a Tensor assignment, and the expression on the left already
+ // has a memory buffer, then we might do a performance optimization and
+ // evaluate the root expression directly into the final output memory.
+ // Sometimes it's also possible to reuse that buffer for materializing
+ // subexpressions inside an expression tree, to avoid dynamic memory
+ // allocation.
+ //
+ // The pointer type of the underlying storage is erased, because passing the
+ // Scalar type through all the expression evaluation layers would require way
+ // too many templates. In practice the destination buffer type should always
+ // match the evaluated expression scalar type.
+ class DestinationBuffer {
+ public:
+ enum DestinationBufferKind : int {
+ // The above explicit specification of "int" as the enum basetype is needed
+ // to get around a HIPCC link error ("the field type is not amp-compatible")
+ // which is issued for class members with the enum type.
+ // TODO(rocm):
+ // remove the "int" basetype once HIPCC has been fixed to not error out
+ // in the above scenario.
+
+ // Destination buffer is not defined (`m_data` == nullptr).
+ kEmpty,
+
+ // The tensor block defined by an owning tensor block descriptor fits
+ // contiguously into the destination buffer. In this case it's safe to
+ // materialize the tensor block in the destination buffer, wrap it in a
+ // TensorMap, and use it to build an Eigen expression on top of it.
+ kContiguous,
+
+ // Destination buffer strides do not match the strides of the contiguously
+ // stored block, and it's impossible to define a TensorMap over this
+ // buffer. However if we are evaluating a root of an expression tree, we
+ // can still materialize an output into this destination, because we can
+ // guarantee that no one will ever access it through the block API.
+ //
+ // In theory it is possible to build a valid TensorStriding<TensorMap>
+ // expression on top of this destination buffer, however it would have
+ // inefficient coeff/packet access, and defeat the purpose of the fast
+ // block evaluation API.
+ kStrided
+ };
+
+ template <typename Scalar>
+ Scalar* data() const {
+ eigen_assert(m_data_type_size == sizeof(Scalar));
+ return static_cast<Scalar*>(m_data);
+ }
+
+ const Dimensions& strides() const { return m_strides; }
+ const DestinationBufferKind& kind() const { return m_kind; }
+
+ private:
+ friend class TensorBlockDescriptor;
+
+ DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
+
+ template <typename Scalar>
+ DestinationBuffer(Scalar* data, const Dimensions& strides,
+ DestinationBufferKind kind)
+ : m_data(static_cast<void*>(data)),
+ m_data_type_size(sizeof(Scalar)),
+ m_strides(strides),
+ m_kind(kind) {}
+
+ template <int Layout, typename Scalar>
+ static DestinationBuffer make(const TensorBlockDescriptor& desc,
+ Scalar* data, const Dimensions& strides) {
+ return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
+ }
+
+ template <int Layout>
+ static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
+ const Dimensions& strides) {
+ const Dimensions& desc_dims = desc.dimensions();
+ const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
+ for (int i = 0; i < NumDims; ++i) {
+ if (desc_dims[i] == 1) continue;
+ if (desc_strides[i] != strides[i]) return kStrided;
+ }
+ return kContiguous;
+ }
+
+ // The storage pointer is type-erased to reduce template bloat, but we still
+ // keep the size of the underlying element type for error checking.
+ void* m_data;
+ size_t m_data_type_size;
+
+ // Destination buffer dimensions always match the dimensions of the tensor
+ // block descriptor it belongs to; however, strides might be different.
+ Dimensions m_strides;
+
+ DestinationBufferKind m_kind;
+ };
+
+ TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
+ const DestinationBuffer& destination)
+ : m_offset(offset),
+ m_dimensions(dimensions),
+ m_destination(destination) {}
+
+ TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
+ : m_offset(offset),
+ m_dimensions(dimensions),
+ m_destination(DestinationBuffer()) {}
+
+ IndexType offset() const { return m_offset; }
+ const Dimensions& dimensions() const { return m_dimensions; }
+ IndexType dimension(int index) const { return m_dimensions[index]; }
+ IndexType size() const { return array_prod<IndexType>(m_dimensions); }
+
+ const DestinationBuffer& destination() const { return m_destination; }
+
+ template <int Layout, typename Scalar>
+ void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
+ eigen_assert(dst_base != NULL);
+ m_destination =
+ DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
+ }
+
+ template <int Layout, typename Scalar, typename DstStridesIndexType>
+ void AddDestinationBuffer(
+ Scalar* dst_base,
+ const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
+ // DSizes constructor will do index type promotion if it's safe.
+ AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
+ }
+
+ TensorBlockDescriptor& DropDestinationBuffer() {
+ m_destination.m_data = NULL;
+ m_destination.m_kind = DestinationBuffer::kEmpty;
+ return *this;
+ }
+
+ bool HasDestinationBuffer() const {
+ return m_destination.kind() != DestinationBuffer::kEmpty;
+ }
+
+ // Returns a copy of `*this` with updated offset.
+ TensorBlockDescriptor WithOffset(IndexType offset) const {
+ return TensorBlockDescriptor(offset, m_dimensions, m_destination);
+ }
+
+ private:
+ // Offset and dimensions are immutable after construction. Block descriptor
+ // can only be mutated by adding or dropping destination.
+ const IndexType m_offset;
+ const Dimensions m_dimensions;
+ DestinationBuffer m_destination;
+};
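+
+// Example (illustrative sketch): a 2x2 block starting at linear offset 10,
+// with a hypothetical destination buffer `dst` whose strides are [1, 100]
+// (e.g. a corner of a 100x100 ColMajor tensor):
+//
+//   typedef TensorBlockDescriptor<2> BlockDesc;
+//   BlockDesc desc(/*offset=*/10, BlockDesc::Dimensions(2, 2));
+//   desc.AddDestinationBuffer<ColMajor>(dst, BlockDesc::Dimensions(1, 100));
+//   eigen_assert(desc.HasDestinationBuffer());  // kind() is kStrided here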
+
+// -------------------------------------------------------------------------- //
+// TensorBlockMapper is responsible for iterating over the blocks of a tensor.
+
+template <int NumDims, int Layout, typename IndexType = Eigen::Index>
+class TensorBlockMapper {
+ typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
+
+ public:
+ typedef DSizes<IndexType, NumDims> Dimensions;
+
+ TensorBlockMapper() = default;
+ TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
+ const TensorBlockResourceRequirements& requirements)
+ : m_tensor_dimensions(dimensions), m_requirements(requirements) {
+ // Initialize `m_block_dimensions`.
+ InitializeBlockDimensions();
+
+ // Calculate block counts by dimension and total block count.
+ DSizes<IndexType, NumDims> block_count;
+ for (int i = 0; i < NumDims; ++i) {
+ block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
+ }
+ m_total_block_count = array_prod(block_count);
+
+ // Calculate block strides (used for enumerating blocks).
+ m_tensor_strides = strides<Layout>(m_tensor_dimensions);
+ m_block_strides = strides<Layout>(block_count);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
+ return m_total_block_count;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
+ return m_block_dimensions.TotalSize();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
+ blockDimensions() const {
+ return m_block_dimensions;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ BlockDescriptor blockDescriptor(IndexType block_index) const {
+ static const bool isColMajor = Layout == static_cast<int>(ColMajor);
+
+ IndexType offset = 0;
+ DSizes<IndexType, NumDims> dimensions;
+
+ if (NumDims == 0) return BlockDescriptor(offset, dimensions);
+
+ // Iterate outer -> inner dimensions.
+ for (int i = NumDims - 1; i >= 0; --i) {
+ const int dim = isColMajor ? i : NumDims - i - 1;
+
+ const IndexType idx = block_index / m_block_strides[dim];
+ block_index -= idx * m_block_strides[dim];
+
+ const IndexType coord = idx * m_block_dimensions[dim];
+ dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
+ m_block_dimensions[dim]);
+ offset += coord * m_tensor_strides[dim];
+ }
+
+ return {offset, dimensions};
+ }
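+
+ // Example (illustrative sketch): for a ColMajor 100x100 tensor split into
+ // 10x10 blocks, block_index 11 maps to block coordinates (1, 1), so the
+ // returned descriptor has offset 1 * 10 * 1 + 1 * 10 * 100 = 1010 and
+ // dimensions [10, 10].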
+
+ private:
+ void InitializeBlockDimensions() {
+ // Requested block shape and size.
+ const TensorBlockShapeType shape_type = m_requirements.shape_type;
+ const IndexType target_block_size =
+ numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
+
+ // Corner case: one of the dimensions is zero. The logic below is too
+ // complex to handle this case in general, so just use a unit block size.
+ // Note: we must not yield blocks with zero dimensions (a recipe for
+ // overflows/underflows, divisions by zero and NaNs later).
+ if (m_tensor_dimensions.TotalSize() == 0) {
+ for (int i = 0; i < NumDims; ++i) {
+ m_block_dimensions[i] = 1;
+ }
+ return;
+ }
+
+ // If the tensor fits into the target block size, evaluate it as a single
+ // block.
+ if (m_tensor_dimensions.TotalSize() <= target_block_size) {
+ m_block_dimensions = m_tensor_dimensions;
+ return;
+ }
+
+ static const bool isColMajor = Layout == static_cast<int>(ColMajor);
+
+ // Block shape skewed towards inner dimension.
+ if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
+ IndexType coeff_to_allocate = target_block_size;
+
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = isColMajor ? i : NumDims - i - 1;
+ m_block_dimensions[dim] =
+ numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
+ coeff_to_allocate = divup(
+ coeff_to_allocate,
+ numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
+ }
+ eigen_assert(coeff_to_allocate == 1);
+
+ } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
+ // The tensor does not fit within the 'target_block_size' budget: calculate
+ // tensor block dimension sizes based on a "square" dimension size target.
+ const IndexType dim_size_target = convert_index<IndexType>(
+ std::pow(static_cast<float>(target_block_size),
+ 1.0f / static_cast<float>(m_block_dimensions.rank())));
+
+ for (int i = 0; i < NumDims; ++i) {
+ // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
+ // a multiple of the packet size. Note that reducing
+ // 'block_dim_size' in this manner can increase the number of
+ // blocks, and so will amplify any per-block overhead.
+ m_block_dimensions[i] =
+ numext::mini(dim_size_target, m_tensor_dimensions[i]);
+ }
+
+ // Add any un-allocated coefficients to inner dimension(s).
+ IndexType total_size = m_block_dimensions.TotalSize();
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = isColMajor ? i : NumDims - i - 1;
+
+ if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
+ const IndexType total_size_other_dims =
+ total_size / m_block_dimensions[dim];
+ const IndexType alloc_avail =
+ divup<IndexType>(target_block_size, total_size_other_dims);
+ if (alloc_avail == m_block_dimensions[dim]) {
+ // Insufficient excess coefficients to allocate.
+ break;
+ }
+ m_block_dimensions[dim] =
+ numext::mini(m_tensor_dimensions[dim], alloc_avail);
+ total_size = total_size_other_dims * m_block_dimensions[dim];
+ }
+ }
+
+ } else {
+ eigen_assert(false); // unknown block shape
+ }
+
+ eigen_assert(m_block_dimensions.TotalSize() >=
+ numext::mini<IndexType>(target_block_size,
+ m_tensor_dimensions.TotalSize()));
+ }
+
+ DSizes<IndexType, NumDims> m_tensor_dimensions;
+ TensorBlockResourceRequirements m_requirements;
+
+ DSizes<IndexType, NumDims> m_block_dimensions;
+ IndexType m_total_block_count;
+
+ DSizes<IndexType, NumDims> m_tensor_strides;
+ DSizes<IndexType, NumDims> m_block_strides;
+};
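+
+// Example (illustrative sketch): enumerating the blocks of a 100x100 tensor
+// with a uniform target block size of 100 coefficients:
+//
+//   DSizes<Eigen::Index, 2> dims(100, 100);
+//   TensorBlockMapper<2, ColMajor> mapper(
+//       dims, {TensorBlockShapeType::kUniformAllDims, 100});
+//   // mapper.blockDimensions() == [10, 10], mapper.blockCount() == 100
+//   for (Eigen::Index i = 0; i < mapper.blockCount(); ++i) {
+//     TensorBlockDescriptor<2> desc = mapper.blockDescriptor(i);
+//     // desc.offset() and desc.dimensions() identify the i-th block.
+//   }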
+
+// -------------------------------------------------------------------------- //
+// TensorBlockScratchAllocator is responsible for allocating temporary buffers
+// for block evaluation (output or input block materialization). Given that
+// Eigen expression traversal order is deterministic, all temporary allocations
+// happen in the same order, and usually have exactly the same size. The
+// scratch allocator keeps track of all dynamic allocations, and after the
+// first block evaluation is completed, we should be able to reuse all the
+// temporary buffers for the next block evaluation.
+
+template <typename Device>
+class TensorBlockScratchAllocator {
+ public:
+ explicit TensorBlockScratchAllocator(const Device& device)
+ : m_device(device), m_allocation_index(0) {}
+
+ ~TensorBlockScratchAllocator() {
+ for (size_t i = 0; i < m_allocations.size(); ++i) {
+ m_device.deallocate(m_allocations[i].ptr);
+ }
+ }
+
+ void* allocate(size_t size) {
+ // TODO(ezhulenev): Remove when replaced with inlined vector.
+ if (m_allocations.capacity() == 0) m_allocations.reserve(8);
+
+ // Check if we already have an existing allocation at the current index.
+ const int num_allocations = static_cast<int>(m_allocations.size());
+ const bool has_allocation = m_allocation_index < num_allocations;
+
+ // Allocation index can't be larger than the number of allocations.
+ eigen_assert(m_allocation_index <= num_allocations);
+
+ // If we have an existing allocation, and its size is larger than or equal
+ // to the requested size, we do nothing.
+
+ // If the current allocation can't fit the requested size, we deallocate it,
+ // and replace it with a larger allocation.
+ if (has_allocation && m_allocations[m_allocation_index].size < size) {
+ m_device.deallocate(m_allocations[m_allocation_index].ptr);
+ m_allocations[m_allocation_index].ptr = m_device.allocate(size);
+ m_allocations[m_allocation_index].size = size;
+ }
+
+ // Make a new allocation if we don't have an existing one.
+ if (!has_allocation) {
+ Allocation allocation;
+ allocation.ptr = m_device.allocate(size);
+ allocation.size = size;
+ m_allocations.push_back(allocation);
+ }
+
+ eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
+ eigen_assert(m_allocations[m_allocation_index].size >= size);
+
+ return m_allocations[m_allocation_index++].ptr;
+ }
+
+ void reset() { m_allocation_index = 0; }
+
+ private:
+ struct Allocation {
+ void* ptr;
+ size_t size;
+ };
+
+ const Device& m_device;
+ int m_allocation_index;
+ // TODO(ezhulenev): This should be an inlined vector.
+ std::vector<Allocation> m_allocations;
+};
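+
+// Example (illustrative sketch): reusing scratch memory across two block
+// evaluations on a hypothetical `device`:
+//
+//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
+//   void* a = scratch.allocate(1024);  // first evaluation: fresh allocation
+//   scratch.reset();
+//   void* b = scratch.allocate(1024);  // second evaluation: reuses `a`
+//   eigen_assert(a == b);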
+
+// -------------------------------------------------------------------------- //
+// TensorBlockKind represents all possible block kinds that can be produced by
+// the TensorEvaluator::evalBlock function.
+enum TensorBlockKind {
+ // Tensor block that is a lazy expression that must be assigned to a
+ // destination using TensorBlockAssign.
+ kExpr,
+
+ // Tensor block that is a view into a memory buffer owned by an underlying
+ // Tensor expression (e.g. it can be a view into a Tensor buffer).
+ kView,
+
+ // Tensor block that was materialized in a scratch memory buffer, allocated
+ // with TensorBlockScratchAllocator. This block must be copied to a
+ // destination, similar to a block of `kExpr` type.
+ kMaterializedInScratch,
+
+ // Tensor block that was materialized directly into the final output memory
+ // buffer. For example if the left side of an assignment is a Tensor, we can
+ // directly materialize the block in the destination memory.
+ //
+ // If strides in the output buffer do not match tensor block strides, the
+ // Tensor expression will be invalid, and should not be used by
+ // TensorBlockAssign or for constructing another block expression.
+ kMaterializedInOutput
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockNotImplemented should be used to define the TensorBlock typedef
+// in TensorEvaluators that do not support block evaluation.
+
+class TensorBlockNotImplemented {
+ public:
+ typedef void XprType;
+};
+
+// -------------------------------------------------------------------------- //
+// XprScalar extracts the Scalar type from Eigen expressions (if the expression
+// type is not void). It's required to be able to define a lazy block
+// expression for argument types that do not support block evaluation.
+
+template <typename XprType>
+struct XprScalar {
+ typedef typename XprType::Scalar type;
+};
+template <>
+struct XprScalar<void> {
+ typedef void type;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorMaterializedBlock is a fully evaluated block of the original tensor,
+// and XprType is just a TensorMap over the data. This block type is typically
+// used to materialize blocks of tensor expressions that can't be efficiently
+// represented as lazy Tensor expressions with fast coeff/packet operations,
+// e.g. we materialize all broadcasts into evaluated blocks.
+//
+// TensorMaterializedBlock does not own its memory buffer. It's either a memory
+// buffer that backs the original expression (e.g. the block is just a view
+// into a Tensor), or a memory buffer allocated with the scratch allocator; in
+// the latter case the scratch allocator will deallocate it at the end of
+// block-based expression execution.
+//
+// If the block was evaluated directly into the output buffer, and strides in
+// the output buffer do not match block strides, the TensorMap expression will
+// be invalid, and should never be used in block assignment or any other tensor
+// expression.
+
+template <typename Scalar, int NumDims, int Layout,
+ typename IndexType = Eigen::Index>
+class TensorMaterializedBlock {
+ public:
+ typedef DSizes<IndexType, NumDims> Dimensions;
+ typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
+
+ TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
+ const Dimensions& dimensions, bool valid_expr = true)
+ : m_kind(kind),
+ m_data(data),
+ m_dimensions(dimensions),
+ m_expr(m_data, m_dimensions),
+ m_valid_expr(valid_expr) {
+ eigen_assert(m_kind == internal::TensorBlockKind::kView ||
+ m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
+ m_kind == internal::TensorBlockKind::kMaterializedInOutput);
+ }
+
+ TensorBlockKind kind() const { return m_kind; }
+ // NOTE(ezhulenev): Returning XprType by value like in other block types
+ // causes asan failures. The theory is that XprType::Nested doesn't work
+ // properly for TensorMap.
+ const XprType& expr() const {
+ eigen_assert(m_valid_expr);
+ return m_expr;
+ }
+ const Scalar* data() const { return m_data; }
+ void cleanup() {}
+
+ typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
+
+ // TensorMaterializedBlock can be backed by different types of storage:
+ //
+ // (1) Contiguous block of memory allocated with scratch allocator.
+ // (2) Contiguous block of memory reused from tensor block descriptor
+ // destination buffer.
+ // (3) Strided block of memory reused from tensor block descriptor
+ // destination buffer.
+ //
+ class Storage {
+ public:
+ Scalar* data() const { return m_data; }
+ const Dimensions& dimensions() const { return m_dimensions; }
+ const Dimensions& strides() const { return m_strides; }
+
+ TensorMaterializedBlock AsTensorMaterializedBlock() const {
+ return TensorMaterializedBlock(
+ m_materialized_in_output
+ ? internal::TensorBlockKind::kMaterializedInOutput
+ : internal::TensorBlockKind::kMaterializedInScratch,
+ m_data, m_dimensions, !m_strided_storage);
+ }
+
+ private:
+ friend class TensorMaterializedBlock;
+
+ Storage(Scalar* data, const Dimensions& dimensions,
+ const Dimensions& strides, bool materialized_in_output,
+ bool strided_storage)
+ : m_data(data),
+ m_dimensions(dimensions),
+ m_strides(strides),
+ m_materialized_in_output(materialized_in_output),
+ m_strided_storage(strided_storage) {}
+
+ Scalar* m_data;
+ Dimensions m_dimensions;
+ Dimensions m_strides;
+ bool m_materialized_in_output;
+ bool m_strided_storage;
+ };
+
+ // Creates storage for a materialized block, either from the block descriptor
+ // destination buffer, or from a new buffer allocated with the scratch
+ // allocator.
+ template <typename TensorBlockScratch>
+ EIGEN_STRONG_INLINE static Storage prepareStorage(
+ TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool allow_strided_storage = false) {
+ // Try to reuse destination as an output block buffer.
+ typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
+
+ if (desc.destination().kind() == DestinationBuffer::kContiguous) {
+ Scalar* buffer = desc.destination().template data<Scalar>();
+ desc.DropDestinationBuffer();
+ return Storage(buffer, desc.dimensions(),
+ internal::strides<Layout>(desc.dimensions()),
+ /*materialized_in_output=*/true,
+ /*strided_storage=*/false);
+
+ } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
+ allow_strided_storage) {
+ Scalar* buffer = desc.destination().template data<Scalar>();
+ desc.DropDestinationBuffer();
+ return Storage(buffer, desc.dimensions(), desc.destination().strides(),
+ /*materialized_in_output=*/true, /*strided_storage=*/true);
+
+ } else {
+ void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
+ return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
+ internal::strides<Layout>(desc.dimensions()),
+ /*materialized_in_output=*/false,
+ /*strided_storage=*/false);
+ }
+ }
+
+ // Creates a materialized block for the given descriptor from a memory buffer.
+ template <typename DataDimensions, typename TensorBlockScratch>
+ EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
+ const Scalar* data, const DataDimensions& data_dims,
+ TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+ eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
+
+ // If the tensor block dimensions cover a contiguous block of the underlying
+ // memory, we can skip the block buffer memory allocation, and construct a
+ // block from the existing `data` memory buffer.
+ //
+ // Example: (RowMajor layout)
+ // data_dims: [11, 12, 13, 14]
+ // desc.dimensions(): [1, 1, 3, 14]
+ //
+ // In this case we can construct a TensorBlock starting at
+ // `data + desc.offset()`, with `desc.dimensions()` block sizes.
+ static const bool is_col_major = Layout == ColMajor;
+
+ // Find out how many inner dimensions have a matching size.
+ int num_matching_inner_dims = 0;
+ for (int i = 0; i < NumDims; ++i) {
+ int dim = is_col_major ? i : NumDims - i - 1;
+ if (data_dims[dim] != desc.dimensions()[dim]) break;
+ ++num_matching_inner_dims;
+ }
+
+ // All the outer dimensions must be of size `1`, except a single dimension
+ // before the matching inner dimension (`3` in the example above).
+ bool can_use_direct_access = true;
+ for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
+ int dim = is_col_major ? i : NumDims - i - 1;
+ if (desc.dimension(dim) != 1) {
+ can_use_direct_access = false;
+ break;
+ }
+ }
+
+ if (can_use_direct_access) {
+ const Scalar* block_start = data + desc.offset();
+ return TensorMaterializedBlock(internal::TensorBlockKind::kView,
+ block_start, desc.dimensions());
+
+ } else {
+ // Reuse destination buffer or allocate new buffer with scratch allocator.
+ const Storage storage = prepareStorage(desc, scratch);
+
+ typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
+ TensorBlockIO;
+ typedef typename TensorBlockIO::Dst TensorBlockIODst;
+ typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+ TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
+ data, desc.offset());
+ TensorBlockIODst dst(storage.dimensions(), storage.strides(),
+ storage.data());
+
+ TensorBlockIO::Copy(dst, src);
+ return storage.AsTensorMaterializedBlock();
+ }
+ }
+
+ private:
+ TensorBlockKind m_kind;
+ const Scalar* m_data;
+ Dimensions m_dimensions;
+ XprType m_expr;
+ bool m_valid_expr;
+};
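+
+// Example (illustrative sketch): materializing a block described by `desc`
+// from an existing buffer `data` of dimensions `data_dims` (both
+// hypothetical). If the block covers a contiguous span of `data` the result
+// is a zero-copy `kView`, otherwise the data is copied via TensorBlockIO:
+//
+//   typedef TensorMaterializedBlock<float, 2, RowMajor> Block;
+//   Block block = Block::materialize(data, data_dims, desc, scratch);
+//   if (block.kind() == internal::TensorBlockKind::kView) { /* no copy */ }
+//   block.cleanup();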
+
+// -------------------------------------------------------------------------- //
+// TensorCwiseUnaryBlock is a lazy tensor expression block that applies a
+// UnaryOp functor to the blocks produced by the underlying Tensor expression.
+
+template <typename UnaryOp, typename ArgTensorBlock>
+class TensorCwiseUnaryBlock {
+
+ static const bool NoArgBlockAccess =
+ internal::is_void<typename ArgTensorBlock::XprType>::value;
+
+ public:
+ typedef typename conditional<
+ NoArgBlockAccess, void,
+ TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
+ type XprType;
+
+ typedef typename XprScalar<XprType>::type Scalar;
+
+ TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
+ : m_arg_block(arg_block), m_functor(functor) {}
+
+ TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+
+ XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
+ const Scalar* data() const { return NULL; }
+ void cleanup() { m_arg_block.cleanup(); }
+
+ private:
+ ArgTensorBlock m_arg_block;
+ UnaryOp m_functor;
+};
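+
+// Example (illustrative sketch): wrapping a hypothetical argument block
+// `arg_block` of type `ArgBlock` into a lazy square-root expression block:
+//
+//   TensorCwiseUnaryBlock<scalar_sqrt_op<float>, ArgBlock> block(
+//       arg_block, scalar_sqrt_op<float>());
+//   // block.expr() is a lazy TensorCwiseUnaryOp that can be evaluated with
+//   // TensorBlockAssignment; call block.cleanup() when done.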
+
+// -------------------------------------------------------------------------- //
+// TensorCwiseBinaryBlock is a lazy tensor expression block that applies a
+// BinaryOp functor to the blocks produced by the two underlying Tensor
+// expressions.
+
+template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
+class TensorCwiseBinaryBlock {
+
+ static const bool NoArgBlockAccess =
+ internal::is_void<typename LhsTensorBlock::XprType>::value ||
+ internal::is_void<typename RhsTensorBlock::XprType>::value;
+
+ public:
+ typedef typename conditional<
+ NoArgBlockAccess, void,
+ TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
+ const typename RhsTensorBlock::XprType> >::type
+ XprType;
+
+ typedef typename XprScalar<XprType>::type Scalar;
+
+ TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
+ const RhsTensorBlock& right_block,
+ const BinaryOp& functor)
+ : m_left_block(left_block),
+ m_right_block(right_block),
+ m_functor(functor) {}
+
+ TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+
+ XprType expr() const {
+ return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
+ }
+
+ const Scalar* data() const { return NULL; }
+
+ void cleanup() {
+ m_left_block.cleanup();
+ m_right_block.cleanup();
+ }
+
+ private:
+ LhsTensorBlock m_left_block;
+ RhsTensorBlock m_right_block;
+ BinaryOp m_functor;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorUnaryExprBlock is a lazy tensor expression block that can construct
+// an arbitrary tensor expression from a block of the underlying type (this is a
+// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
+
+template <typename BlockFactory, typename ArgTensorBlock>
+class TensorUnaryExprBlock {
+
+ typedef typename ArgTensorBlock::XprType ArgXprType;
+ static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
+
+ public:
+ typedef typename conditional<
+ NoArgBlockAccess, void,
+ typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;
+
+ typedef typename XprScalar<XprType>::type Scalar;
+
+ TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
+ const BlockFactory& factory)
+ : m_arg_block(arg_block), m_factory(factory) {}
+
+ TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+ XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
+ const Scalar* data() const { return NULL; }
+ void cleanup() { m_arg_block.cleanup(); }
+
+ private:
+ ArgTensorBlock m_arg_block;
+ BlockFactory m_factory;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorTernaryExprBlock is a lazy tensor expression block that can construct
+// an arbitrary tensor expression from three blocks of the underlying type.
+
+template <typename BlockFactory, typename Arg1TensorBlock,
+ typename Arg2TensorBlock, typename Arg3TensorBlock>
+class TensorTernaryExprBlock {
+
+ typedef typename Arg1TensorBlock::XprType Arg1XprType;
+ typedef typename Arg2TensorBlock::XprType Arg2XprType;
+ typedef typename Arg3TensorBlock::XprType Arg3XprType;
+
+ static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
+ internal::is_void<Arg2XprType>::value ||
+ internal::is_void<Arg3XprType>::value;
+
+ public:
+ typedef typename conditional<
+ NoArgBlockAccess, void,
+ typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
+ Arg3XprType>::type>::type XprType;
+
+ typedef typename XprScalar<XprType>::type Scalar;
+
+ TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
+ const Arg2TensorBlock& arg2_block,
+ const Arg3TensorBlock& arg3_block,
+ const BlockFactory& factory)
+ : m_arg1_block(arg1_block),
+ m_arg2_block(arg2_block),
+ m_arg3_block(arg3_block),
+ m_factory(factory) {}
+
+ TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+ XprType expr() const {
+ return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
+ m_arg3_block.expr());
+ }
+ const Scalar* data() const { return NULL; }
+ void cleanup() {
+ m_arg1_block.cleanup();
+ m_arg2_block.cleanup();
+ m_arg3_block.cleanup();
+ }
+
+ private:
+ Arg1TensorBlock m_arg1_block;
+ Arg2TensorBlock m_arg2_block;
+ Arg3TensorBlock m_arg3_block;
+ BlockFactory m_factory;
+};
+
+// -------------------------------------------------------------------------- //
+// StridedLinearBufferCopy provides a method to copy data between two linear
+// buffers with different strides, with optimized paths for scatter/gather.
+
+template <typename Scalar, typename IndexType>
+class StridedLinearBufferCopy {
+ typedef typename packet_traits<Scalar>::type Packet;
+ enum {
+ Vectorizable = packet_traits<Scalar>::Vectorizable,
+ PacketSize = packet_traits<Scalar>::size
+ };
+
+ public:
+ // Specifying the linear copy kind statically gives a ~30% speedup for small
+ // sizes.
+ enum Kind {
+ Linear = 0, // src_stride == 1 && dst_stride == 1
+ Scatter = 1, // src_stride == 1 && dst_stride != 1
+ FillLinear = 2, // src_stride == 0 && dst_stride == 1
+ FillScatter = 3, // src_stride == 0 && dst_stride != 1
+ Gather = 4, // dst_stride == 1
+ Random = 5 // everything else
+ };
+
+ struct Dst {
+ Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
+
+ IndexType offset;
+ IndexType stride;
+ Scalar* data;
+ };
+
+ struct Src {
+ Src(IndexType o, IndexType s, const Scalar* d)
+ : offset(o), stride(s), data(d) {}
+
+ IndexType offset;
+ IndexType stride;
+ const Scalar* data;
+ };
+
+ template <StridedLinearBufferCopy::Kind kind>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
+ const Src& src,
+ const size_t count) {
+ Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
+ src.data);
+ }
+
+ private:
+ template <StridedLinearBufferCopy::Kind kind>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const IndexType count, const IndexType dst_offset,
+ const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
+ const IndexType src_offset, const IndexType src_stride,
+ const Scalar* EIGEN_RESTRICT src_data) {
+ const Scalar* src = &src_data[src_offset];
+ Scalar* dst = &dst_data[dst_offset];
+
+ if (!Vectorizable) {
+ for (Index i = 0; i < count; ++i) {
+ dst[i * dst_stride] = src[i * src_stride];
+ }
+ return;
+ }
+
+ const IndexType vectorized_size = count - PacketSize;
+ IndexType i = 0;
+
+ if (kind == Linear) {
+ // ******************************************************************** //
+ // Linear copy from `src` to `dst`.
+ const IndexType unrolled_size = count - 4 * PacketSize;
+ eigen_assert(src_stride == 1 && dst_stride == 1);
+ for (; i <= unrolled_size; i += 4 * PacketSize) {
+ for (int j = 0; j < 4; ++j) {
+ Packet p = ploadu<Packet>(src + i + j * PacketSize);
+ pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
+ }
+ }
+ for (; i <= vectorized_size; i += PacketSize) {
+ Packet p = ploadu<Packet>(src + i);
+ pstoreu<Scalar, Packet>(dst + i, p);
+ }
+ for (; i < count; ++i) {
+ dst[i] = src[i];
+ }
+ // ******************************************************************** //
+ } else if (kind == Scatter) {
+ // Scatter from `src` to `dst`.
+ eigen_assert(src_stride == 1 && dst_stride != 1);
+ for (; i <= vectorized_size; i += PacketSize) {
+ Packet p = ploadu<Packet>(src + i);
+ pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
+ }
+ for (; i < count; ++i) {
+ dst[i * dst_stride] = src[i];
+ }
+ // ******************************************************************** //
+ } else if (kind == FillLinear) {
+ // Fill `dst` with value at `*src`.
+ eigen_assert(src_stride == 0 && dst_stride == 1);
+ const IndexType unrolled_size = count - 4 * PacketSize;
+ Packet p = pload1<Packet>(src);
+ for (; i <= unrolled_size; i += 4 * PacketSize) {
+ for (int j = 0; j < 4; ++j) {
+ pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
+ }
+ }
+ for (; i <= vectorized_size; i += PacketSize) {
+ pstoreu<Scalar, Packet>(dst + i, p);
+ }
+ for (; i < count; ++i) {
+ dst[i] = *src;
+ }
+ // ******************************************************************** //
+ } else if (kind == FillScatter) {
+ // Scatter `*src` into `dst`.
+ eigen_assert(src_stride == 0 && dst_stride != 1);
+ Packet p = pload1<Packet>(src);
+ for (; i <= vectorized_size; i += PacketSize) {
+ pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
+ }
+ for (; i < count; ++i) {
+ dst[i * dst_stride] = *src;
+ }
+ // ******************************************************************** //
+ } else if (kind == Gather) {
+ // Gather from `src` into `dst`.
+ eigen_assert(dst_stride == 1);
+ for (; i <= vectorized_size; i += PacketSize) {
+ Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
+ pstoreu<Scalar, Packet>(dst + i, p);
+ }
+ for (; i < count; ++i) {
+ dst[i] = src[i * src_stride];
+ }
+ // ******************************************************************** //
+ } else if (kind == Random) {
+ // Random.
+ for (; i < count; ++i) {
+ dst[i * dst_stride] = src[i * src_stride];
+ }
+ } else {
+ eigen_assert(false);
+ }
+ }
+};
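+
+// Example (illustrative sketch): scattering 4 contiguous floats from `src`
+// into every 3rd element of `dst` (both hypothetical buffers):
+//
+//   typedef StridedLinearBufferCopy<float, Eigen::Index> LinCopy;
+//   LinCopy::Run<LinCopy::Scatter>(
+//       LinCopy::Dst(/*offset=*/0, /*stride=*/3, dst),
+//       LinCopy::Src(/*offset=*/0, /*stride=*/1, src),
+//       /*count=*/4);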
+
+// -------------------------------------------------------------------------- //
+// TensorBlockIO copies data from the `src` tensor block to the `dst` tensor
+// block. It's possible to specify a src->dst dimension mapping for the copy
+// operation. Dimensions of `dst` specify how many elements have to be copied;
+// for the `src` we only need to know the strides to navigate through the
+// source memory buffer.
+
+template <typename Scalar, typename IndexType, int NumDims, int Layout>
+class TensorBlockIO {
+ static const bool IsColMajor = (Layout == ColMajor);
+
+ typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
+
+ public:
+ typedef DSizes<IndexType, NumDims> Dimensions;
+ typedef DSizes<int, NumDims> DimensionsMap;
+
+ struct Dst {
+ Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
+ IndexType dst_offset = 0)
+ : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
+
+ Dimensions dims;
+ Dimensions strides;
+ Scalar* data;
+ IndexType offset;
+ };
+
+ struct Src {
+ Src(const Dimensions& src_strides, const Scalar* src,
+ IndexType src_offset = 0)
+ : strides(src_strides), data(src), offset(src_offset) {}
+
+ Dimensions strides;
+ const Scalar* data;
+ IndexType offset;
+ };
+
+ // Copies data to `dst` from `src`, using provided dimensions mapping:
+ //
+ // src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
+ //
+ // Returns the number of copied elements.
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
+ const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
+ // Copy single scalar value from `src` to `dst`.
+ if (NumDims == 0) {
+ *(dst.data + dst.offset) = *(src.data + src.offset);
+ return 1;
+ }
+
+ // Both `dst` and `src` must have a contiguous innermost dimension. We also
+ // accept the special case of stride '0', because it's used as a trick to
+ // implement broadcasting.
+ {
+ int inner_dim = IsColMajor ? 0 : NumDims - 1;
+ EIGEN_UNUSED_VARIABLE(inner_dim);
+ eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
+ eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
+ }
+
+ // Give a shorter name to `dst_to_src_dim_map`.
+ const DimensionsMap& dim_map = dst_to_src_dim_map;
+
+ // Do not squeeze reordered inner dimensions.
+ int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
+
+ // NOTE: We find the innermost dimension (contiguous in memory) in the dst
+ // block, and we write data linearly into that dimension, reading it from
+ // the src. If dimensions are reordered, we might end up reading data from
+ // the src with `stride != 1`.
+ //
+ // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
+ // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
+
+ // Find the innermost dimension in the dst whose size is not 1. This is the
+ // effective inner dim.
+ int num_size_one_inner_dims = 0;
+ for (int i = 0; i < num_squeezable_dims; ++i) {
+ const int dst_dim = IsColMajor ? i : NumDims - i - 1;
+ if (dst.dims[dst_dim] != 1) break;
+ num_size_one_inner_dims++;
+ }
+
+ // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
+ if (num_size_one_inner_dims == NumDims) {
+ *(dst.data + dst.offset) = *(src.data + src.offset);
+ return 1;
+ }
+
+ // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
+ const int dst_stride1_dim = IsColMajor
+ ? num_size_one_inner_dims
+ : NumDims - num_size_one_inner_dims - 1;
+
+ // Dimension in the src that corresponds to the dst innermost dimension.
+ const int src_dim_for_dst_stride1_dim =
+ NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
+
+ // Size of the innermost dimension (length of contiguous blocks of memory).
+ IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
+
+ // Squeeze multiple inner dims into one if they are contiguous in `dst` and
+ // `src` memory, so we can do fewer linear copy calls.
+ for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
+ const int dst_dim = IsColMajor ? i : NumDims - i - 1;
+ const IndexType dst_stride = dst.strides[dst_dim];
+ const IndexType src_stride = src.strides[dim_map[dst_dim]];
+ if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
+ dst_inner_dim_size *= dst.dims[dst_dim];
+ ++num_size_one_inner_dims;
+ } else {
+ break;
+ }
+ }
+
+ // Setup strides to read data from `src` and write to `dst`.
+ IndexType input_offset = src.offset;
+ IndexType output_offset = dst.offset;
+ IndexType input_stride =
+ NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
+ IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
+
+ const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
+ array<BlockIteratorState, at_least_1_dim> it;
+
+ // Initialize block iterator state. Squeeze away any dimension of size 1.
+ int idx = 0; // currently initialized iterator state index
+ for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
+ const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
+ if (dst.dims[dst_dim] == 1) continue;
+
+ it[idx].size = dst.dims[dst_dim];
+ it[idx].input_stride = src.strides[dim_map[dst_dim]];
+ it[idx].output_stride = dst.strides[dst_dim];
+
+ it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
+ it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
+
+ idx++;
+ }
+
+ // Iterate copying data from src to dst.
+ const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
+
+#define COPY_INNER_DIM(KIND) \
+ IndexType num_copied = 0; \
+ for (num_copied = 0; num_copied < block_total_size; \
+ num_copied += dst_inner_dim_size) { \
+ LinCopy::template Run<KIND>( \
+ typename LinCopy::Dst(output_offset, output_stride, dst.data), \
+ typename LinCopy::Src(input_offset, input_stride, src.data), \
+ dst_inner_dim_size); \
+ \
+ for (int j = 0; j < idx; ++j) { \
+ if (++it[j].count < it[j].size) { \
+ input_offset += it[j].input_stride; \
+ output_offset += it[j].output_stride; \
+ break; \
+ } \
+ it[j].count = 0; \
+ input_offset -= it[j].input_span; \
+ output_offset -= it[j].output_span; \
+ } \
+ } \
+ return num_copied;
+
+ if (input_stride == 1 && output_stride == 1) {
+ COPY_INNER_DIM(LinCopy::Linear);
+ } else if (input_stride == 1 && output_stride != 1) {
+ COPY_INNER_DIM(LinCopy::Scatter);
+ } else if (input_stride == 0 && output_stride == 1) {
+ COPY_INNER_DIM(LinCopy::FillLinear);
+ } else if (input_stride == 0 && output_stride != 1) {
+ COPY_INNER_DIM(LinCopy::FillScatter);
+ } else if (output_stride == 1) {
+ COPY_INNER_DIM(LinCopy::Gather);
+ } else {
+ COPY_INNER_DIM(LinCopy::Random);
+ }
+
+#undef COPY_INNER_DIM
+ }
+
+ // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
+ // the number of copied elements.
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
+ const Src& src) {
+ DimensionsMap dst_to_src_map;
+ for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
+ return Copy(dst, src, dst_to_src_map);
+ }
+
+ private:
+ struct BlockIteratorState {
+ BlockIteratorState()
+ : size(0),
+ count(0),
+ input_stride(0),
+ output_stride(0),
+ input_span(0),
+ output_span(0) {}
+
+ IndexType size;
+ IndexType count;
+ IndexType input_stride;
+ IndexType output_stride;
+ IndexType input_span;
+ IndexType output_span;
+ };
+
+ // Computes how many inner dimensions are allowed to be squeezed when doing
+ // IO between two tensor blocks. It's safe to squeeze inner dimensions only
+ // if they are not reordered.
+ static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
+ int num_squeezable_dims = 0;
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = IsColMajor ? i : NumDims - i - 1;
+ if (dim_map[dim] != dim) break;
+ num_squeezable_dims++;
+ }
+ return num_squeezable_dims;
+ }
+};
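+
+// Example (illustrative sketch): copying a [2, 3] RowMajor block between two
+// buffers with different outer strides (`src_buf` and `dst_buf` are
+// hypothetical):
+//
+//   typedef TensorBlockIO<float, Eigen::Index, 2, RowMajor> IO;
+//   IO::Dimensions block_dims(2, 3);
+//   IO::Dst dst(block_dims, IO::Dimensions(10, 1), dst_buf);
+//   IO::Src src(IO::Dimensions(3, 1), src_buf);
+//   Eigen::Index copied = IO::Copy(dst, src);  // returns 6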
+
+// -------------------------------------------------------------------------- //
+// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
+// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
+//
+// Currently there is no way to write from a Tensor expression to a block of
+// memory if dimensions are reordered. If you need to do that, you should
+// materialize a Tensor block expression into a memory buffer, and then use
+// TensorBlockIO to copy data between two memory buffers with a custom
+// `target->src` dimension map (see the definition above).
+//
+// Also, currently the innermost dimension of `target` must have stride '1'
+// (be contiguous in memory). This restriction could be lifted with a
+// `pscatter`, but in practice it's never needed, and there is a similar
+// TensorBlockIO workaround for that.
+//
+// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
+// where `src` is a tensor expression. Explore if it is possible to rewrite IO
+// to use expressions instead of pointers, and after that TensorBlockAssignment
+// will become an alias to IO.
+template <typename Scalar, int NumDims, typename TensorBlockExpr,
+ typename IndexType = Eigen::Index>
+class TensorBlockAssignment {
+ // We will use coeff/packet path to evaluate block expressions.
+ typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
+ TensorBlockEvaluator;
+
+ typedef DSizes<IndexType, NumDims> Dimensions;
+
+ enum {
+ Vectorizable = packet_traits<Scalar>::Vectorizable,
+ PacketSize = packet_traits<Scalar>::size
+ };
+
+ template <bool Vectorizable, typename Evaluator>
+ struct InnerDimAssign {
+ EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
+ const Evaluator& eval,
+ IndexType eval_offset) {
+ for (IndexType i = 0; i < count; ++i) {
+ target[i] = eval.coeff(eval_offset + i);
+ }
+ }
+ };
+
+ template <typename Evaluator>
+ struct InnerDimAssign<true, Evaluator> {
+ EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
+ const Evaluator& eval,
+ IndexType eval_offset) {
+ typedef typename packet_traits<Scalar>::type Packet;
+
+ const IndexType unrolled_size = count - 4 * PacketSize;
+ const IndexType vectorized_size = count - PacketSize;
+ IndexType i = 0;
+
+ for (; i <= unrolled_size; i += 4 * PacketSize) {
+ for (int j = 0; j < 4; ++j) {
+ const IndexType idx = eval_offset + i + j * PacketSize;
+ Packet p = eval.template packet<Unaligned>(idx);
+ pstoreu<Scalar>(target + i + j * PacketSize, p);
+ }
+ }
+
+ for (; i <= vectorized_size; i += PacketSize) {
+ Packet p = eval.template packet<Unaligned>(eval_offset + i);
+ pstoreu<Scalar>(target + i, p);
+ }
+
+ for (; i < count; ++i) {
+ target[i] = eval.coeff(eval_offset + i);
+ }
+ }
+ };
+
+ public:
+ struct Target {
+ Target(const Dimensions& target_dims, const Dimensions& target_strides,
+ Scalar* target_data, IndexType target_offset = 0)
+ : dims(target_dims),
+ strides(target_strides),
+ data(target_data),
+ offset(target_offset) {}
+
+ Dimensions dims;
+ Dimensions strides;
+ Scalar* data;
+ IndexType offset;
+ };
+
+ static Target target(const Dimensions& target_dims,
+ const Dimensions& target_strides, Scalar* target_data,
+ IndexType target_offset = 0) {
+ return Target(target_dims, target_strides, target_data, target_offset);
+ }
+
+ template <typename TargetDimsIndexType, typename TargetStridesIndexType>
+ static Target target(
+ const DSizes<TargetDimsIndexType, NumDims>& target_dims,
+ const DSizes<TargetStridesIndexType, NumDims>& target_strides,
+ Scalar* target_data, IndexType target_offset = 0) {
+ // DSizes constructor will do index type promotion if it's safe.
+ return Target(Dimensions(target_dims), Dimensions(target_strides),
+ target_data, target_offset);
+ }
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const Target& target, const TensorBlockExpr& expr) {
+ // Prepare evaluator for block expression.
+ DefaultDevice default_device;
+ TensorBlockEvaluator eval(expr, default_device);
+
+ // Tensor block expression dimensions should match the destination
+ // dimensions.
+ eigen_assert(dimensions_match(target.dims, eval.dimensions()));
+
+ static const int Layout = TensorBlockEvaluator::Layout;
+ static const bool is_col_major = Layout == ColMajor;
+
+ // Initialize the output inner dimension size based on the layout.
+ const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
+ const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
+ IndexType output_inner_dim_size = target.dims[inner_dim_idx];
+
+ // Target inner dimension stride must be '1'.
+ eigen_assert(target.strides[inner_dim_idx] == 1);
+
+ // Squeeze multiple inner dims into one if they are contiguous in `target`.
+ IndexType num_squeezed_dims = 0;
+ for (Index i = 1; i < NumDims; ++i) {
+ const Index dim = is_col_major ? i : NumDims - i - 1;
+ const IndexType target_stride = target.strides[dim];
+
+ if (output_inner_dim_size == target_stride) {
+ output_inner_dim_size *= target.dims[dim];
+ num_squeezed_dims++;
+ } else {
+ break;
+ }
+ }
+
+ // Initialize output block iterator state. Dimensions in this array are
+ // always in innermost -> outermost order (col major layout).
+ array<BlockIteratorState, NumDims> it;
+
+ int idx = 0; // currently initialized iterator state index
+ for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
+ const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
+
+ it[idx].count = 0;
+ it[idx].size = target.dims[dim];
+ it[idx].output_stride = target.strides[dim];
+ it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
+ idx++;
+ }
+
+ // We read the block expression from the beginning, and start writing data
+ // to `target` at the given offset.
+ IndexType input_offset = 0;
+ IndexType output_offset = target.offset;
+
+ // Iterate copying data from `eval` to `target`.
+ for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
+ // Assign to `target` at current offset.
+ InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
+ TensorBlockEvaluator>::Run(target.data + output_offset,
+ output_inner_dim_size, eval,
+ input_offset);
+
+ // Move input offset forward by the number of assigned coefficients.
+ input_offset += output_inner_dim_size;
+
+ // Update index.
+ for (int j = 0; j < idx; ++j) {
+ if (++it[j].count < it[j].size) {
+ output_offset += it[j].output_stride;
+ break;
+ }
+ it[j].count = 0;
+ output_offset -= it[j].output_span;
+ }
+ }
+ }
+
+ private:
+ struct BlockIteratorState {
+ BlockIteratorState()
+ : count(0), size(0), output_stride(0), output_span(0) {}
+
+ IndexType count;
+ IndexType size;
+ IndexType output_stride;
+ IndexType output_span;
+ };
+};
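+
+// Example (illustrative sketch): evaluating a block expression `expr` of a
+// hypothetical type `Expr` into a contiguous [4, 5] ColMajor buffer `out`:
+//
+//   typedef TensorBlockAssignment<float, 2, Expr> Assign;
+//   DSizes<Eigen::Index, 2> dims(4, 5);
+//   Assign::Run(
+//       Assign::target(dims, internal::strides<ColMajor>(dims), out), expr);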
+
+// -------------------------------------------------------------------------- //
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H