From c97b208468ccb2e6414fb4086ed997b5f1903d90 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Tue, 24 Sep 2019 15:17:35 -0700
Subject: Add new TensorBlock api implementation + tests

---
 unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h | 960 +++++++++++++++++++++
 unsupported/test/cxx11_tensor_block_eval.cpp | 339 ++++++++
 unsupported/test/cxx11_tensor_block_io.cpp | 438 ++++++++++
 3 files changed, 1737 insertions(+)
 create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
 create mode 100644 unsupported/test/cxx11_tensor_block_eval.cpp
 create mode 100644 unsupported/test/cxx11_tensor_block_io.cpp

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
new file mode 100644
index 000000000..ef1e4d417
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
@@ -0,0 +1,960 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H
+#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H
+
+namespace Eigen {
+namespace internal {
+
+// -------------------------------------------------------------------------- //
+// Helper function to compute strides for a densely stored buffer of given
+// dimensions.
+
+// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
+// this function instead everywhere.
+template <int Layout, typename IndexType, int NumDims>
+EIGEN_STRONG_INLINE DSizes<IndexType, NumDims> strides(
+    const DSizes<IndexType, NumDims>& dimensions) {
+  DSizes<IndexType, NumDims> strides;
+  if (NumDims == 0) return strides;
+
+  // TODO(ezhulenev): Use templates to unroll this loop (similar to
+  // h_array_reduce in CXX11meta.h)? Benchmark it.
+  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+    strides[0] = 1;
+    for (int i = 1; i < NumDims; ++i) {
+      strides[i] = strides[i - 1] * dimensions[i - 1];
+    }
+  } else {
+    strides[NumDims - 1] = 1;
+    for (int i = NumDims - 2; i >= 0; --i) {
+      strides[i] = strides[i + 1] * dimensions[i + 1];
+    }
+  }
+
+  return strides;
+}
+
+// -------------------------------------------------------------------------- //
+// TensorBlockDescriptor specifies a block offset within a tensor and the block
+// sizes along each of the tensor dimensions.
+
+template <int NumDims, typename IndexType = Eigen::Index>
+class TensorBlockDescriptor {
+ public:
+  typedef DSizes<IndexType, NumDims> Dimensions;
+
+  // If we evaluate a Tensor assignment, and the expression on the left already
+  // has a memory buffer, then we might do a performance optimization, and
+  // evaluate the root expression directly into that memory, or maybe use it as
+  // temporary storage for some of the subexpressions, to avoid dynamic memory
+  // allocation.
+  //
+  // This is type-erased storage, because passing the Scalar type through all
+  // the expression evaluation layers requires way too many templates. Also it
+  // should be possible to use this destination as a temp buffer for
+  // materializing expressions with a type that does not match the final
+  // output.
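+  //
+  // Rough usage sketch (illustrative only, template arguments omitted):
+  //
+  //   TensorBlockDescriptor desc(offset, block_dims);
+  //   desc.AddDestinationBuffer(dst_ptr, dst_strides, total_dst_bytes);
+  //   Scalar* out = desc.destination();  // non-null if the block fits
+  //
+  // If `out` is non-null, the block producer may materialize the result
+  // directly into the destination memory instead of a scratch buffer.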
+ class DestinationBuffer { + public: + template + Scalar* data() const { + return static_cast(m_data); + } + + private: + friend class TensorBlockDescriptor; + + DestinationBuffer() : m_data(NULL), m_total_dst_bytes(0) {} + + template + DestinationBuffer(Scalar* data, const Dimensions& dimensions, + const Dimensions& strides, size_t total_dst_bytes) + : m_data(static_cast(data)), + m_dimensions(dimensions), + m_strides(strides), + m_total_dst_bytes(total_dst_bytes) { + // TODO(ezhulenev): Benchmark template meta-unroll for this loop. + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] *= sizeof(Scalar); + m_strides[i] *= sizeof(Scalar); + } + } + + // Returns true if the tensor block corresponding to `desc` fits into the + // contiguous block of memory defined by `*this`. + template + bool fitsContiguously(const TensorBlockDescriptor& desc) const { + if (m_data == NULL) return false; + + const Dimensions& desc_dims = desc.dimensions(); + const Dimensions& dst_dims = dimensions(); + + if (!dimensions_match(desc_dims, dst_dims)) return false; + + const Dimensions& desc_strides = internal::strides(desc_dims); + const Dimensions& dst_strides = internal::strides(dst_dims); + + return dimensions_match(desc_strides, dst_strides); + } + + template + Dimensions dimensions() const { + Dimensions dimensions; + for (int i = 0; i < NumDims; ++i) { + eigen_assert(m_dimensions[i] % sizeof(Scalar) == 0); + dimensions[i] = m_dimensions[i] / sizeof(Scalar); + } + return dimensions; + } + + template + Dimensions strides() const { + Dimensions strides; + for (int i = 0; i < NumDims; ++i) { + eigen_assert(m_strides[i] % sizeof(Scalar) == 0); + strides[i] = m_strides[i] / sizeof(Scalar); + } + return strides; + } + + void* m_data; + Dimensions m_dimensions; + Dimensions m_strides; + + // Total size of the memory buffer at the destination (typically the total + // size of the left hand side of an assignment expression). This can be the + // same as `array_prod(m_dimensions)` if the assignment target has just a + // single block, but typically it's a larger number. + size_t m_total_dst_bytes; + }; + + TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions, + const DestinationBuffer& destination) + : m_offset(offset), + m_dimensions(dimensions), + m_destination(destination) {} + + TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions) + : m_offset(offset), + m_dimensions(dimensions), + m_destination(DestinationBuffer()) {} + + IndexType offset() const { return m_offset; } + const Dimensions& dimensions() const { return m_dimensions; } + IndexType dimension(int index) const { return m_dimensions[index]; } + IndexType size() const { return array_prod(m_dimensions); } + + template + void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides, + size_t total_dst_bytes) { + m_destination = + DestinationBuffer(dst_base, m_dimensions, dst_strides, total_dst_bytes); + } + + TensorBlockDescriptor& DropDestinationBuffer() { + m_destination.m_data = NULL; + return *this; + } + + // Returns a non-nullptr pointer to a destination buffer memory if this + // block has a contiguous destination buffer. + template + Scalar* destination() const { + if (m_destination.template fitsContiguously(*this)) { + return m_destination.template data(); + } + return NULL; + } + + private: + // Offset and dimensions are immutable after construction. Block descriptor + // can only be mutated by adding or dropping destination. 
+ const IndexType m_offset; + const Dimensions m_dimensions; + DestinationBuffer m_destination; +}; + +// -------------------------------------------------------------------------- // +// TensorBlockScratchAllocator is responsible for allocating temporary buffers +// for block evaluation (output or input block materialization). Given that +// Eigen expression traversal order is deterministic, all temporary allocations +// are happening in the same order, and usually have exactly the same size. +// Scratch allocator keeps a trace of all dynamic allocations, and after the +// first block evaluation is completed, we should be able to reuse all the +// temporary buffers for the next block evaluation. + +template +class TensorBlockScratchAllocator { + public: + explicit TensorBlockScratchAllocator(const Device& device) + : m_device(device), m_allocation_index(0) {} + + ~TensorBlockScratchAllocator() { + for (size_t i = 0; i < m_allocations.size(); ++i) { + m_device.deallocate(m_allocations[i].ptr); + } + } + + void* allocate(size_t size) { + // TODO(ezhulenev): Remove when replaced with inlined vector. + if (m_allocations.capacity() == 0) m_allocations.reserve(8); + + // Check if we already have an existing allocation att current index. + const int num_allocations = static_cast(m_allocations.size()); + const bool has_allocation = m_allocation_index < num_allocations; + + // Allocation index can't be larger than the number of allocations. + eigen_assert(m_allocation_index <= num_allocations); + + // If we have existing allocation, and its size is larger or equal to + // requested size, we do nothing. + + // If current allocation can't fit requested size, we deallocate it, and + // replace with a larger allocation. + if (has_allocation && m_allocations[m_allocation_index].size < size) { + m_device.deallocate(m_allocations[m_allocation_index].ptr); + m_allocations[m_allocation_index].ptr = m_device.allocate(size); + m_allocations[m_allocation_index].size = size; + } + + // Make a new allocation if we don't have and existing one. + if (!has_allocation) { + Allocation allocation; + allocation.ptr = m_device.allocate(size); + allocation.size = size; + m_allocations.push_back(allocation); + } + + eigen_assert(m_allocations[m_allocation_index].ptr != NULL); + eigen_assert(m_allocations[m_allocation_index].size >= size); + + return m_allocations[m_allocation_index++].ptr; + } + + void reset() { m_allocation_index = 0; } + + private: + struct Allocation { + void* ptr; + size_t size; + }; + + const Device& m_device; + int m_allocation_index; + // TODO(ezhulenev): This should be an inlined vector. + std::vector m_allocations; +}; + +// -------------------------------------------------------------------------- // +// TensorBlockKind represents all possible block kinds, that can be produced by +// TensorEvaluator::evalBlock function. +#if !EIGEN_HAS_CXX11 +// To be able to use `TensorBlockKind::kExpr` in C++03 we need a namespace. +// (Use of enumeration in a nested name specifier is a c++11 extension). +namespace TensorBlockKind { +#endif +enum TensorBlockKind { + // Tensor block that is a lazy expression that must be assigned to a + // destination using TensorBlockAssign. + kExpr, + + // Tensor block that is a view into a memory buffer owned by an underlying + // Tensor expression (e.g. it can be a view into a Tensor buffer). + kView, + + // Tensor block that was materialized in a scratch memory buffer, allocated + // with TensorBlockScratchAllocator. 
This block must be copied to a + // destination, similar to a block of `kExpr` type. + kMaterializedInScratch, + + // Tensor block that was materialized directly into the final output memory + // buffer. For example if the left side of an assignment is a Tensor, we can + // directly materialize the block in the destination memory. The block + // expression is still a valid Tensor expression, and can be used to build + // lazy expressions. + kMaterializedInOutput + + // TODO(ezhulenev): If we know that we are evaluating a block, for the root of + // the expression tree, it might be beneficial to do an assignment to the + // output memory buffer, even if it will be impossible to construct a valid + // block expression after that (e.g. output memory buffer has strides not + // compatible with TensorMap). This might be a performance optimization for + // uniformly shaped blocks, because for blocks skewed towards inner dimension + // `kMaterializedInOutput` should always work. +}; +#if !EIGEN_HAS_CXX11 +} // namespace TensorBlockKind +#endif + +// -------------------------------------------------------------------------- // +// TensorBlockNotImplemented should be used to defined TensorBlock typedef in +// TensorEvaluators that do not support block evaluation. + +class TensorBlockNotImplemented { + public: + typedef void XprType; +}; + +// -------------------------------------------------------------------------- // +// XprScalar extracts Scalar type from the Eigen expressions (if expression type +// is not void). It's required to be able to define lazy block expression for +// argument types, that do not support block evaluation. + +template +struct XprScalar { + typedef typename XprType::Scalar type; +}; +template <> +struct XprScalar { + typedef void type; +}; + +// -------------------------------------------------------------------------- // +// TensorMaterializedBlock is a fully evaluated block of the original tensor, +// and XprType is just a TensorMap over the data. This block type is typically +// used to materialize blocks of tensor expressions, that can't be efficiently +// represented as lazy Tensor expressions with fast coeff/packet operations, +// e.g. we materialize all broadcasts into evaluated blocks. +// +// TensorMaterializedBlock does not own its memory buffer, it's either a memory +// buffer that backs the original expression (e.g. block is just a view into a +// Tensor), or a memory buffer allocated with scratch allocator, and in this +// case the scratch allocator will deallocate it at the end of block based +// expression execution. + +template +class TensorMaterializedBlock { +#if !EIGEN_HAS_CXX11 + typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind; +#endif + public: + typedef TensorMap > XprType; + + TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, + const DSizes& dimensions) + : m_kind(kind), + m_data(data), + m_dimensions(dimensions), + m_expr(m_data, m_dimensions) { + eigen_assert(m_kind == TensorBlockKind::kView || + m_kind == TensorBlockKind::kMaterializedInScratch || + m_kind == TensorBlockKind::kMaterializedInOutput); + } + + TensorBlockKind kind() const { return m_kind; } + // NOTE(ezhulenev): Returning XprType by value like in other block types + // causes asan failures. The theory is that XprType::Nested doesn't work + // properly for TensorMap. 
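+  //
+  // Illustrative sketch (not part of this patch, template arguments omitted):
+  // wrap an existing buffer as a view-kind block and use it as a regular
+  // tensor expression:
+  //
+  //   TensorMaterializedBlock block(TensorBlockKind::kView, data, dims);
+  //   auto map = block.expr();  // TensorMap over `data`
+  //   // ... build lazy expressions from `map`, then call block.cleanup().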
+ const XprType& expr() const { return m_expr; } + const Scalar* data() const { return m_data; } + + void cleanup() {} + + private: + TensorBlockKind m_kind; + const Scalar* m_data; + DSizes m_dimensions; + XprType m_expr; +}; + +// -------------------------------------------------------------------------- // +// TensorCwiseUnaryBlock is a lazy tensor expression that applies UnaryOp +// functor to the blocks produced by the underlying Tensor expression. + +template +class TensorCwiseUnaryBlock { +#if !EIGEN_HAS_CXX11 + typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind; +#endif + + static const bool NoArgBlockAccess = + internal::is_void::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + TensorCwiseUnaryOp >::type + XprType; + + typedef typename XprScalar::type Scalar; + + TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor) + : m_arg_block(arg_block), m_functor(functor) {} + + TensorBlockKind kind() const { return TensorBlockKind::kExpr; } + + XprType expr() const { return XprType(m_arg_block.expr(), m_functor); } + const Scalar* data() const { return NULL; } + void cleanup() { m_arg_block.cleanup(); } + + private: + ArgTensorBlock m_arg_block; + UnaryOp m_functor; +}; + +// -------------------------------------------------------------------------- // +// TensorCwiseUnaryBlock is a lazy tensor expression that applies BinaryOp +// functor to the blocks produced by the underlying Tensor expression. + +template +class TensorCwiseBinaryBlock { +#if !EIGEN_HAS_CXX11 + typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind; +#endif + + static const bool NoArgBlockAccess = + internal::is_void::value || + internal::is_void::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + TensorCwiseBinaryOp >::type + XprType; + + typedef typename XprScalar::type Scalar; + + TensorCwiseBinaryBlock(const LhsTensorBlock& left_block, + const RhsTensorBlock& right_block, + const BinaryOp& functor) + : m_left_block(left_block), + m_right_block(right_block), + m_functor(functor) {} + + TensorBlockKind kind() const { return TensorBlockKind::kExpr; } + + XprType expr() const { + return XprType(m_left_block.expr(), m_right_block.expr(), m_functor); + } + + const Scalar* data() const { return NULL; } + + void cleanup() { + m_left_block.cleanup(); + m_right_block.cleanup(); + } + + private: + LhsTensorBlock m_left_block; + RhsTensorBlock m_right_block; + BinaryOp m_functor; +}; + +// -------------------------------------------------------------------------- // +// StridedLinearBufferCopy provides a method to copy data between two linear +// buffers with different strides, with optimized paths for scatter/gather. 
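+//
+// Rough usage sketch (assumed template arguments, for illustration only):
+//
+//   typedef StridedLinearBufferCopy<float, Index> Copy;
+//   Copy::Run(Copy::Dst(/*offset=*/0, /*stride=*/1, dst_ptr),
+//             Copy::Src(/*offset=*/0, /*stride=*/3, src_ptr), count);
+//
+// This copies `count` floats from a source with stride 3 into a contiguous
+// destination (the gather path below).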
+ +template +class StridedLinearBufferCopy { + typedef typename packet_traits::type Packet; + enum { + Vectorizable = packet_traits::Vectorizable, + PacketSize = packet_traits::size + }; + + public: + struct Dst { + Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {} + + IndexType offset; + IndexType stride; + Scalar* data; + }; + + struct Src { + Src(IndexType o, IndexType s, const Scalar* d) + : offset(o), stride(s), data(d) {} + + IndexType offset; + IndexType stride; + const Scalar* data; + }; + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst, + const Src& src, + const size_t count) { + Run(count, dst.offset, dst.stride, dst.data, src.offset, src.stride, + src.data); + } + + private: + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const IndexType count, const IndexType dst_offset, + const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data, + const IndexType src_offset, const IndexType src_stride, + const Scalar* EIGEN_RESTRICT src_data) { + const Scalar* src = &src_data[src_offset]; + Scalar* dst = &dst_data[dst_offset]; + + if (!Vectorizable) { + for (Index i = 0; i < count; ++i) { + dst[i * dst_stride] = src[i * src_stride]; + } + return; + } + + const IndexType unrolled_size = count - 4 * PacketSize; + const IndexType vectorized_size = count - PacketSize; + IndexType i = 0; + + if (src_stride == 1 && dst_stride == 1) { + // ******************************************************************** // + // Linear copy from `src` to `dst`. + for (; i <= unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + Packet p = ploadu(src + i + j * PacketSize); + pstoreu(dst + i + j * PacketSize, p); + } + } + for (; i <= vectorized_size; i += PacketSize) { + Packet p = ploadu(src + i); + pstoreu(dst + i, p); + } + for (; i < count; ++i) { + dst[i] = src[i]; + } + // ******************************************************************** // + } else if (src_stride == 1 && dst_stride != 1) { + // Scatter from `src` to `dst`. + for (; i <= vectorized_size; i += PacketSize) { + Packet p = ploadu(src + i); + pscatter(dst + i * dst_stride, p, dst_stride); + } + for (; i < count; ++i) { + dst[i * dst_stride] = src[i]; + } + // ******************************************************************** // + } else if (src_stride == 0 && dst_stride == 1) { + // Fill `dst` with value at `*src`. + Packet p = pload1(src); + for (; i <= unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + pstoreu(dst + i + j * PacketSize, p); + } + } + for (; i <= vectorized_size; i += PacketSize) { + pstoreu(dst + i, p); + } + for (; i < count; ++i) { + dst[i] = *src; + } + // ******************************************************************** // + } else if (src_stride == 0 && dst_stride != 1) { + // Scatter `*src` into `dst`. + Packet p = pload1(src); + for (; i <= vectorized_size; i += PacketSize) { + pscatter(dst + i * dst_stride, p, dst_stride); + } + for (; i < count; ++i) { + dst[i * dst_stride] = *src; + } + // ******************************************************************** // + } else if (dst_stride == 1) { + // Gather from `src` into `dst`. + for (; i <= vectorized_size; i += PacketSize) { + Packet p = pgather(src + i * src_stride, src_stride); + pstoreu(dst + i, p); + } + for (; i < count; ++i) { + dst[i] = src[i * src_stride]; + } + // ******************************************************************** // + } else { + // Random. 
+ for (; i < count; ++i) { + dst[i * dst_stride] = src[i * src_stride]; + } + } + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block. +// It's possible to specify src->dst dimension mapping for the copy operation. +// Dimensions of `dst` specify how many elements have to be copied, for the +// `src` we need to know only stride to navigate through source memory buffer. + +template +class TensorBlockIOV2 { + static const bool IsColMajor = (Layout == ColMajor); + + typedef StridedLinearBufferCopy LinCopy; + + public: + typedef DSizes Dimensions; + + struct Dst { + Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, + IndexType dst_offset = 0) + : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {} + + Dimensions dims; + Dimensions strides; + Scalar* data; + IndexType offset; + }; + + struct Src { + Src(const Dimensions& src_strides, const Scalar* src, + IndexType src_offset = 0) + : strides(src_strides), data(src), offset(src_offset) {} + + Dimensions strides; + const Scalar* data; + IndexType offset; + }; + + // Copies data to `dst` from `src`, using provided dimensions mapping: + // + // src_dimension_index = dst_to_src_dim_map[dst_dimension_index] + // + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy( + const Dst& dst, const Src& src, const Dimensions& dst_to_src_dim_map) { + // Copy single scalar value from `src` to `dst`. + if (NumDims == 0) { + *(dst.data + dst.offset) = *(src.data + src.offset); + return; + } + + // Both `dst` and `src` must have contiguous innermost dimension. We also + // accept the special case with stride '0', because it's used as a trick to + // implement broadcasting. + { + int inner_dim = IsColMajor ? 0 : NumDims - 1; + EIGEN_UNUSED_VARIABLE(inner_dim); + eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0); + eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0); + } + + // Give a shorter name to `dst_to_src_dim_map`. + const Dimensions& dim_map = dst_to_src_dim_map; + + // Do not squeeze reordered inner dimensions. + int num_squeezable_dims = NumSqueezableInnerDims(dim_map); + + // NOTE: We find the innermost dimension (contiguous in memory) in the dst + // block, and we write data linearly into that dimension, reading it from + // the src. If dimensions are reordered, we might end up reading data from + // the src with `stride != 1`. + // + // NOTE: Random-Read/Linear-Write can be up to ~2X faster than + // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680 + + // Find the innermost dimension in the dst whose size is not 1. This is the + // effective inner dim. + IndexType num_size_one_inner_dims = 0; + for (int i = 0; i < num_squeezable_dims; ++i) { + const int dst_dim = IsColMajor ? i : NumDims - i - 1; + if (dst.dims[dst_dim] != 1) break; + num_size_one_inner_dims++; + } + + // If all dimensions are of size 1, just copy a scalar from `src` to `dst`. + if (num_size_one_inner_dims == NumDims) { + *(dst.data + dst.offset) = *(src.data + src.offset); + return; + } + + // Outermost dimension in the dst with `stride == 1` (contiguous in memory). + const IndexType dst_stride1_dim = + IsColMajor ? num_size_one_inner_dims + : NumDims - num_size_one_inner_dims - 1; + + // Dimension in the src that corresponds to the dst innermost dimension. + const IndexType src_dim_for_dst_stride1_dim = + NumDims == 0 ? 
1 : dim_map[dst_stride1_dim]; + + // Size of the innermost dimension (length of contiguous blocks of memory). + IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim]; + + // Squeeze multiple inner dims into one if they are contiguous in `dst` and + // `src` memory, so we can do less linear copy calls. + for (Index i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) { + const int dst_dim = IsColMajor ? i : NumDims - i - 1; + const IndexType dst_stride = dst.strides[dst_dim]; + const IndexType src_stride = src.strides[dim_map[dst_dim]]; + if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) { + dst_inner_dim_size *= dst.dims[dst_dim]; + ++num_size_one_inner_dims; + } else { + break; + } + } + + // Setup strides to read data from `src` and write to `dst`. + IndexType input_offset = src.offset; + IndexType output_offset = dst.offset; + IndexType input_stride = + NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim]; + IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim]; + + const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; + array it; + + // Initialize block iterator state. Squeeze away any dimension of size 1. + int idx = 0; // currently initialized iterator state index + for (Index i = num_size_one_inner_dims; i < NumDims - 1; ++i) { + const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2; + if (dst.dims[dst_dim] == 1) continue; + + it[idx].size = dst.dims[dst_dim]; + it[idx].input_stride = src.strides[dim_map[dst_dim]]; + it[idx].output_stride = dst.strides[dst_dim]; + + it[idx].input_span = it[idx].input_stride * (it[idx].size - 1); + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + + idx++; + } + + // Iterate copying data from src to dst. + const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize(); + + for (IndexType i = 0; i < block_total_size; i += dst_inner_dim_size) { + // Copy data for the innermost dimension. + LinCopy::Run( + typename LinCopy::Dst(output_offset, output_stride, dst.data), + typename LinCopy::Src(input_offset, input_stride, src.data), + dst_inner_dim_size); + + // Update offsets (idx is the number of initialize block iterators). + for (int j = 0; j < idx; ++j) { + if (++it[j].count < it[j].size) { + input_offset += it[j].input_stride; + output_offset += it[j].output_stride; + break; + } + it[j].count = 0; + input_offset -= it[j].input_span; + output_offset -= it[j].output_span; + } + } + } + + // Copy from `src` to `dst` with an identity src->dst dimension map. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(const Dst& dst, + const Src& src) { + Dimensions dst_to_src_map; + for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i; + Copy(dst, src, dst_to_src_map); + } + + private: + struct BlockIteratorState { + BlockIteratorState() + : size(0), + count(0), + input_stride(0), + output_stride(0), + input_span(0), + output_span(0) {} + + IndexType size; + IndexType count; + IndexType input_stride; + IndexType output_stride; + IndexType input_span; + IndexType output_span; + }; + + // Compute how many inner dimensions it's allowed to squeeze when doing IO + // between two tensor blocks. It's safe to squeeze inner dimensions, only + // if they are not reordered. + static int NumSqueezableInnerDims(const Dimensions& dim_map) { + int num_squeezable_dims = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? 
i : NumDims - i - 1; + if (dim_map[dim] != dim) break; + num_squeezable_dims++; + } + return num_squeezable_dims; + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to +// a Tensor block defined by `desc`, backed by a memory buffer at `dst` address. +// +// Currently there is no way to write from a Tensor expression to a block of +// memory, if dimensions are reordered. If you need to do that, you should +// materialize a Tensor block expression into a memory buffer, and then use +// TensorBlockIO to copy data between two memory buffers with a custom +// `dst->src` dimension map (see definition above). +// +// Also currently the innermost dimension of `dst` must have a stride '1' +// (contiguous in memory). This restriction could be lifted with a `pscatter`, +// but in practice it's never needed, and there is a similar TensorBlockIO +// workaround for that. +// +// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO +// where `src` is a tensor expression. Explore if it is possible to rewrite IO +// to use expressions instead of pointers, and after that TensorBlockAssignment +// will become an alias to IO. +template +class TensorBlockAssignment { + // We will use coeff/packet path to evaluate block expressions. + typedef TensorEvaluator + TensorBlockEvaluator; + + typedef DSizes Dimensions; + + enum { + Vectorizable = packet_traits::Vectorizable, + PacketSize = packet_traits::size + }; + + template + struct InnerDimAssign { + EIGEN_ALWAYS_INLINE static void Run(Scalar* dst, IndexType count, + const Evaluator& eval, + IndexType eval_offset) { + for (IndexType i = 0; i < count; ++i) { + dst[i] = eval.coeff(eval_offset + i); + } + } + }; + + template + struct InnerDimAssign { + EIGEN_ALWAYS_INLINE static void Run(Scalar* dst, IndexType count, + const Evaluator& eval, + IndexType eval_offset) { + typedef typename packet_traits::type Packet; + + const IndexType unrolled_size = count - 4 * PacketSize; + const IndexType vectorized_size = count - PacketSize; + IndexType i = 0; + + for (; i <= unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + const IndexType idx = eval_offset + i + j * PacketSize; + Packet p = eval.template packet(idx); + pstoreu(dst + i + j * PacketSize, p); + } + } + + for (; i <= vectorized_size; i += PacketSize) { + Packet p = eval.template packet(eval_offset + i); + pstoreu(dst + i, p); + } + + for (; i < count; ++i) { + dst[i] = eval.coeff(eval_offset + i); + } + } + }; + + public: + struct Dst { + Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, + IndexType dst_offset = 0) + : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {} + + Dimensions dims; + Dimensions strides; + Scalar* data; + IndexType offset; + }; + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Dst& dst, const TensorBlockExpr& expr) { + // Prepare evaluator for block expression. + DefaultDevice default_device; + TensorBlockEvaluator eval(expr, default_device); + + // Tensor block expression dimension should match destination dimensions. + eigen_assert(dimensions_match(dst.dims, eval.dimensions())); + + static const int Layout = TensorBlockEvaluator::Layout; + static const bool is_col_major = Layout == ColMajor; + + // Initialize output inner dimension size based on a layout. + const IndexType output_size = NumDims == 0 ? 1 : dst.dims.TotalSize(); + const int inner_dim_idx = is_col_major ? 
0 : NumDims - 1; + IndexType output_inner_dim_size = dst.dims[inner_dim_idx]; + + // Dst inner dimension stride must be '1'. + eigen_assert(dst.strides[inner_dim_idx] == 1); + + // Squeeze multiple inner dims into one if they are contiguous in `dst`. + IndexType num_squeezed_dims = 0; + for (Index i = 1; i < NumDims; ++i) { + const Index dim = is_col_major ? i : NumDims - i - 1; + const IndexType dst_stride = dst.strides[dim]; + + if (output_inner_dim_size == dst_stride) { + output_inner_dim_size *= dst.dims[dim]; + num_squeezed_dims++; + } else { + break; + } + } + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + + int idx = 0; // currently initialized iterator state index + for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) { + const Index dim = is_col_major ? i + 1 : NumDims - i - 2; + + it[idx].count = 0; + it[idx].size = dst.dims[dim]; + it[idx].output_stride = dst.strides[dim]; + it[idx].output_span = it[i].output_stride * (it[i].size - 1); + idx++; + } + + // We read block expression from the beginning, and start writing data to + // `dst` at given offset. + IndexType input_offset = 0; + IndexType output_offset = dst.offset; + + // Iterate copying data from `eval` to `dst`. + for (IndexType i = 0; i < output_size; i += output_inner_dim_size) { + // Assign to `dst` at current offset. + InnerDimAssign::Run(dst.data + output_offset, + output_inner_dim_size, eval, + input_offset); + + // Move input offset forward by the number of assigned coefficients. + input_offset += output_inner_dim_size; + + // Update index. + for (int j = 0; j < idx; ++j) { + if (++it[j].count < it[j].size) { + output_offset += it[j].output_stride; + break; + } + it[j].count = 0; + output_offset -= it[j].output_span; + } + } + } + + private: + struct BlockIteratorState { + BlockIteratorState() + : count(0), size(0), output_stride(0), output_span(0) {} + + IndexType count; + IndexType size; + IndexType output_stride; + IndexType output_span; + }; +}; + +// -------------------------------------------------------------------------- // + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_V2_H diff --git a/unsupported/test/cxx11_tensor_block_eval.cpp b/unsupported/test/cxx11_tensor_block_eval.cpp new file mode 100644 index 000000000..e85b81141 --- /dev/null +++ b/unsupported/test/cxx11_tensor_block_eval.cpp @@ -0,0 +1,339 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// clang-format off +#include "main.h" +#include +// clang-format on + +using Eigen::internal::TensorBlockDescriptor; +using Eigen::internal::TensorExecutor; + +// -------------------------------------------------------------------------- // +// Utility functions to generate random tensors, blocks, and evaluate them. + +template +static DSizes RandomDims(Index min, Index max) { + DSizes dims; + for (int i = 0; i < NumDims; ++i) { + dims[i] = internal::random(min, max); + } + return DSizes(dims); +} + +// Block offsets and extents allows to construct a TensorSlicingOp corresponding +// to a TensorBlockDescriptor. 
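+// That is, `expr.slice(offsets, sizes)` evaluates exactly the same
+// coefficients as the block described by `desc`.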
+template +struct TensorBlockParams { + DSizes offsets; + DSizes sizes; + TensorBlockDescriptor desc; +}; + +template +static TensorBlockParams RandomBlock(DSizes dims, + Index min, Index max) { + // Choose random offsets and sizes along all tensor dimensions. + DSizes offsets(RandomDims(min, max)); + DSizes sizes(RandomDims(min, max)); + + // Make sure that offset + size do not overflow dims. + for (int i = 0; i < NumDims; ++i) { + offsets[i] = numext::mini(dims[i] - 1, offsets[i]); + sizes[i] = numext::mini(sizes[i], dims[i] - offsets[i]); + } + + Index offset = 0; + DSizes strides = Eigen::internal::strides(dims); + for (int i = 0; i < NumDims; ++i) { + offset += strides[i] * offsets[i]; + } + + return {offsets, sizes, TensorBlockDescriptor(offset, sizes)}; +} + +// Generate block with block sizes skewed towards inner dimensions. This type of +// block is required for evaluating broadcast expressions. +template +static TensorBlockParams SkewedInnerBlock( + DSizes dims) { + using BlockMapper = internal::TensorBlockMapper; + BlockMapper block_mapper(dims, + internal::TensorBlockShapeType::kSkewedInnerDims, + internal::random(1, dims.TotalSize())); + + Index total_blocks = block_mapper.total_block_count(); + Index block_index = internal::random(0, total_blocks - 1); + auto block = block_mapper.GetBlockForIndex(block_index, nullptr); + DSizes sizes = block.block_sizes(); + + auto strides = internal::strides(dims); + DSizes offsets; + + // Compute offsets for the first block coefficient. + Index index = block.first_coeff_index(); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / strides[i]; + index -= idx * strides[i]; + offsets[i] = idx; + } + offsets[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / strides[i]; + index -= idx * strides[i]; + offsets[i] = idx; + } + offsets[NumDims - 1] = index; + } + + auto desc = TensorBlockDescriptor(block.first_coeff_index(), sizes); + return {offsets, sizes, desc}; +} + +template +static TensorBlockParams FixedSizeBlock(DSizes dims) { + DSizes offsets; + for (int i = 0; i < NumDims; ++i) offsets[i] = 0; + + return {offsets, dims, TensorBlockDescriptor(0, dims)}; +} + +// -------------------------------------------------------------------------- // +// Verify that block expression evaluation produces the same result as a +// TensorSliceOp (reading a tensor block is same to taking a tensor slice). + +template +static void VerifyBlockEvaluator(Expression expr, GenBlockParams gen_block) { + using Device = DefaultDevice; + auto d = Device(); + + // Scratch memory allocator for block evaluation. + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + TensorBlockScratch scratch(d); + + // TensorEvaluator is needed to produce tensor blocks of the expression. + auto eval = TensorEvaluator(expr, d); + + // Choose a random offsets, sizes and TensorBlockDescriptor. + TensorBlockParams block_params = gen_block(); + + // Evaluate TensorBlock expression into a tensor. + Tensor block(block_params.desc.dimensions()); + + // Maybe use this tensor as a block desc destination. 
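+  // If a destination buffer is attached, the evaluator may materialize the
+  // block directly into `dst` and return a block of kMaterializedInOutput kind.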
+ Tensor dst(block_params.desc.dimensions()); + if (internal::random()) { + block_params.desc.template AddDestinationBuffer( + dst.data(), internal::strides(dst.dimensions()), + dst.dimensions().TotalSize() * sizeof(T)); + } + + auto tensor_block = eval.blockV2(block_params.desc, scratch); + auto b_expr = tensor_block.expr(); + + // We explicitly disable vectorization and tiling, to run a simple coefficient + // wise assignment loop, because it's very simple and should be correct. + using BlockAssign = TensorAssignOp; + using BlockExecutor = TensorExecutor; + BlockExecutor::run(BlockAssign(block, b_expr), d); + + // Cleanup temporary buffers owned by a tensor block. + tensor_block.cleanup(); + + // Compute a Tensor slice corresponding to a Tensor block. + Tensor slice(block_params.desc.dimensions()); + auto s_expr = expr.slice(block_params.offsets, block_params.sizes); + + // Explicitly use coefficient assignment to evaluate slice expression. + using SliceAssign = TensorAssignOp; + using SliceExecutor = TensorExecutor; + SliceExecutor::run(SliceAssign(slice, s_expr), d); + + // Tensor block and tensor slice must be the same. + for (Index i = 0; i < block.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(block.coeff(i), slice.coeff(i)); + } +} + +// -------------------------------------------------------------------------- // + +template +static void test_eval_tensor_block() { + DSizes dims = RandomDims(10, 20); + Tensor input(dims); + input.setRandom(); + + // Identity tensor expression transformation. + VerifyBlockEvaluator( + input, [&dims]() { return RandomBlock(dims, 10, 20); }); +} + +template +static void test_eval_tensor_unary_expr_block() { + DSizes dims = RandomDims(10, 20); + Tensor input(dims); + input.setRandom(); + + VerifyBlockEvaluator( + input.square(), [&dims]() { return RandomBlock(dims, 10, 20); }); +} + +template +static void test_eval_tensor_binary_expr_block() { + DSizes dims = RandomDims(10, 20); + Tensor lhs(dims), rhs(dims); + lhs.setRandom(); + rhs.setRandom(); + + VerifyBlockEvaluator( + lhs + rhs, [&dims]() { return RandomBlock(dims, 10, 20); }); +} + +template +static void test_eval_tensor_binary_with_unary_expr_block() { + DSizes dims = RandomDims(10, 20); + Tensor lhs(dims), rhs(dims); + lhs.setRandom(); + rhs.setRandom(); + + VerifyBlockEvaluator( + (lhs.square() + rhs.square()).sqrt(), + [&dims]() { return RandomBlock(dims, 10, 20); }); +} + +template +static void test_eval_tensor_broadcast() { + DSizes dims = RandomDims(1, 10); + Tensor input(dims); + input.setRandom(); + + DSizes bcast = RandomDims(1, 5); + + DSizes bcasted_dims; + for (int i = 0; i < NumDims; ++i) bcasted_dims[i] = dims[i] * bcast[i]; + + VerifyBlockEvaluator( + input.broadcast(bcast), + [&bcasted_dims]() { return SkewedInnerBlock(bcasted_dims); }); + + VerifyBlockEvaluator( + input.broadcast(bcast), + [&bcasted_dims]() { return FixedSizeBlock(bcasted_dims); }); + + // Check that desc.destination() memory is not shared between two broadcast + // materializations. + VerifyBlockEvaluator( + input.broadcast(bcast) + input.square().broadcast(bcast), + [&bcasted_dims]() { return SkewedInnerBlock(bcasted_dims); }); +} + +// -------------------------------------------------------------------------- // +// Verify that assigning block to a Tensor expression produces the same result +// as an assignment to TensorSliceOp (writing a block is is identical to +// assigning one tensor to a slice of another tensor). 
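+// The check below writes a block through the evaluator's
+// `writeBlockV2(desc, block)` method and compares the result against a
+// coefficient-wise assignment to `expr.slice(offsets, sizes)`.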
+ +template +static void VerifyBlockAssignment(Tensor& tensor, + Expression expr, GenBlockParams gen_block) { + using Device = DefaultDevice; + auto d = Device(); + + // We use tensor evaluator as a target for block and slice assignments. + auto eval = TensorEvaluator(expr, d); + + // Generate a random block, or choose a block that fits in full expression. + TensorBlockParams block_params = gen_block(); + + // Generate random data of the selected block size. + Tensor block(block_params.desc.dimensions()); + block.setRandom(); + + // ************************************************************************ // + // (1) Assignment from a block. + + // Construct a materialize block from a random generated block tensor. + internal::TensorMaterializedBlock blk( + internal::TensorBlockKind::kView, block.data(), block.dimensions()); + + // Reset all underlying tensor values to zero. + tensor.setZero(); + + // Use evaluator to write block into a tensor. + eval.writeBlockV2(block_params.desc, blk); + + // Make a copy of the result after assignment. + Tensor block_assigned = tensor; + + // ************************************************************************ // + // (2) Assignment to a slice + + // Reset all underlying tensor values to zero. + tensor.setZero(); + + // Assign block to a slice of original expression + auto s_expr = expr.slice(block_params.offsets, block_params.sizes); + + // Explicitly use coefficient assignment to evaluate slice expression. + using SliceAssign = TensorAssignOp; + using SliceExecutor = TensorExecutor; + SliceExecutor::run(SliceAssign(s_expr, block), d); + + // Make a copy of the result after assignment. + Tensor slice_assigned = tensor; + + for (Index i = 0; i < tensor.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(block_assigned.coeff(i), slice_assigned.coeff(i)); + } +} + +// -------------------------------------------------------------------------- // + +template +static void test_assign_tensor_block() { + DSizes dims = RandomDims(10, 20); + Tensor tensor(dims); + + TensorMap> map(tensor.data(), dims); + + VerifyBlockAssignment( + tensor, map, [&dims]() { return RandomBlock(dims, 10, 20); }); + VerifyBlockAssignment( + tensor, map, [&dims]() { return FixedSizeBlock(dims); }); +} + +// -------------------------------------------------------------------------- // + +//#define CALL_SUBTESTS(NAME) CALL_SUBTEST((NAME())) + +#define CALL_SUBTESTS(NAME) \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())) + +EIGEN_DECLARE_TEST(cxx11_tensor_block_eval) { + // clang-format off + CALL_SUBTESTS(test_eval_tensor_block); + CALL_SUBTESTS(test_eval_tensor_unary_expr_block); + CALL_SUBTESTS(test_eval_tensor_binary_expr_block); + CALL_SUBTESTS(test_eval_tensor_binary_with_unary_expr_block); + CALL_SUBTESTS(test_eval_tensor_broadcast); + + CALL_SUBTESTS(test_assign_tensor_block); + // clang-format on +} diff --git a/unsupported/test/cxx11_tensor_block_io.cpp b/unsupported/test/cxx11_tensor_block_io.cpp new file mode 100644 index 000000000..8a03c7dd4 --- /dev/null +++ b/unsupported/test/cxx11_tensor_block_io.cpp @@ -0,0 +1,438 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// clang-format off +#include "main.h" +#include +// clang-format on + +// -------------------------------------------------------------------------- // +// A set of tests for TensorBlockIO: copying data between tensor blocks. + +template +static DSizes RandomDims(Index min, Index max) { + DSizes dims; + for (int i = 0; i < NumDims; ++i) { + dims[i] = internal::random(min, max); + } + return DSizes(dims); +} + +static internal::TensorBlockShapeType RandomBlockShape() { + return internal::random() ? internal::kUniformAllDims + : internal::kSkewedInnerDims; +} + +template +static Index RandomTargetBlockSize(const DSizes& dims) { + return internal::random(1, dims.TotalSize()); +} + +template +static Index GetInputIndex(Index output_index, + const array& output_to_input_dim_map, + const array& input_strides, + const array& output_strides) { + int input_index = 0; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = output_index / output_strides[i]; + input_index += idx * input_strides[output_to_input_dim_map[i]]; + output_index -= idx * output_strides[i]; + } + return input_index + + output_index * input_strides[output_to_input_dim_map[0]]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = output_index / output_strides[i]; + input_index += idx * input_strides[output_to_input_dim_map[i]]; + output_index -= idx * output_strides[i]; + } + return input_index + + output_index * input_strides[output_to_input_dim_map[NumDims - 1]]; + } +} + +template +static void test_block_io_copy_data_from_source_to_target() { + using TensorBlockIO = internal::TensorBlockIOV2; + using IODst = typename TensorBlockIO::Dst; + using IOSrc = typename TensorBlockIO::Src; + + // Generate a random input Tensor. + DSizes dims = RandomDims(1, 30); + Tensor input(dims); + input.setRandom(); + + // Write data to an output Tensor. + Tensor output(dims); + + // Construct a tensor block mapper. + using TensorBlockMapper = + internal::TensorBlockMapper; + TensorBlockMapper block_mapper(dims, RandomBlockShape(), + RandomTargetBlockSize(dims)); + + // We will copy data from input to output through this buffer. + Tensor block(block_mapper.block_dim_sizes()); + + // Precompute strides for TensorBlockIO::Copy. + auto input_strides = internal::strides(dims); + auto output_strides = internal::strides(dims); + + const T* input_data = input.data(); + T* output_data = output.data(); + T* block_data = block.data(); + + for (int i = 0; i < block_mapper.total_block_count(); ++i) { + using TensorBlock = internal::TensorBlock; + TensorBlock blk = block_mapper.GetBlockForIndex(i, block_data); + + auto blk_dims = blk.block_sizes(); + auto blk_strides = internal::strides(blk_dims); + + { + // Read from input into a block buffer. + IODst dst(blk_dims, blk_strides, block_data, 0); + IOSrc src(input_strides, input_data, blk.first_coeff_index()); + + TensorBlockIO::Copy(dst, src); + } + + { + // Write from block buffer to output. + IODst dst(blk_dims, output_strides, output_data, blk.first_coeff_index()); + IOSrc src(blk_strides, block_data, 0); + + TensorBlockIO::Copy(dst, src); + } + } + + for (int i = 0; i < dims.TotalSize(); ++i) { + VERIFY_IS_EQUAL(input_data[i], output_data[i]); + } +} + +template +static void test_block_io_copy_using_reordered_dimensions() { + // Generate a random input Tensor. 
+ DSizes dims = RandomDims(1, 30); + Tensor input(dims); + input.setRandom(); + + // Create a random dimension re-ordering/shuffle. + std::vector shuffle; + + for (int i = 0; i < NumDims; ++i) shuffle.push_back(i); + std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937(g_seed)); + + DSizes output_tensor_dims; + DSizes input_to_output_dim_map; + DSizes output_to_input_dim_map; + for (Index i = 0; i < NumDims; ++i) { + output_tensor_dims[shuffle[i]] = dims[i]; + input_to_output_dim_map[i] = shuffle[i]; + output_to_input_dim_map[shuffle[i]] = i; + } + + // Write data to an output Tensor. + Tensor output(output_tensor_dims); + + // Construct a tensor block mapper. + // NOTE: Tensor block mapper works with shuffled dimensions. + using TensorBlockMapper = + internal::TensorBlockMapper; + TensorBlockMapper block_mapper(output_tensor_dims, RandomBlockShape(), + RandomTargetBlockSize(output_tensor_dims)); + + // We will copy data from input to output through this buffer. + Tensor block(block_mapper.block_dim_sizes()); + + // Precompute strides for TensorBlockIO::Copy. + auto input_strides = internal::strides(dims); + auto output_strides = internal::strides(output_tensor_dims); + + const T* input_data = input.data(); + T* output_data = output.data(); + T* block_data = block.data(); + + for (Index i = 0; i < block_mapper.total_block_count(); ++i) { + using TensorBlock = internal::TensorBlock; + TensorBlock blk = block_mapper.GetBlockForIndex(i, block_data); + + const Index first_coeff_index = GetInputIndex( + blk.first_coeff_index(), output_to_input_dim_map, input_strides, + output_strides); + + // NOTE: Block dimensions are in the same order as output dimensions. + + using TensorBlockIO = internal::TensorBlockIOV2; + using IODst = typename TensorBlockIO::Dst; + using IOSrc = typename TensorBlockIO::Src; + + auto blk_dims = blk.block_sizes(); + auto blk_strides = internal::strides(blk_dims); + + { + // Read from input into a block buffer. + IODst dst(blk_dims, blk_strides, block_data, 0); + IOSrc src(input_strides, input_data, first_coeff_index); + + TensorBlockIO::Copy(dst, src, + /*dst_to_src_dim_map=*/output_to_input_dim_map); + } + + { + // We need to convert block dimensions from output to input order. + auto dst_dims = blk_dims; + for (int out_dim = 0; out_dim < NumDims; ++out_dim) { + dst_dims[output_to_input_dim_map[out_dim]] = blk_dims[out_dim]; + } + + // Write from block buffer to output. + IODst dst(dst_dims, input_strides, output_data, first_coeff_index); + IOSrc src(blk_strides, block_data, 0); + + TensorBlockIO::Copy(dst, src, + /*dst_to_src_dim_map=*/input_to_output_dim_map); + } + } + + for (Index i = 0; i < dims.TotalSize(); ++i) { + VERIFY_IS_EQUAL(input_data[i], output_data[i]); + } +} + +// This is the special case for reading data with reordering, when dimensions +// before/after reordering are the same. Squeezing reads along inner dimensions +// in this case is illegal, because we reorder innermost dimension. 
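+// For example, with tensor dimensions (7, 9, 7) and the dimension map
+// {2, 1, 0} used below, the block's inner dimension maps to the tensor's
+// outer dimension, so inner dimensions must not be collapsed into a single
+// linear copy.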
+template +static void test_block_io_copy_using_reordered_dimensions_do_not_squeeze() { + DSizes tensor_dims(7, 9, 7); + DSizes block_dims = tensor_dims; + + DSizes block_to_tensor_dim; + block_to_tensor_dim[0] = 2; + block_to_tensor_dim[1] = 1; + block_to_tensor_dim[2] = 0; + + auto tensor_strides = internal::strides(tensor_dims); + auto block_strides = internal::strides(block_dims); + + Tensor block(block_dims); + Tensor tensor(tensor_dims); + tensor.setRandom(); + + float* tensor_data = tensor.data(); + float* block_data = block.data(); + + typedef internal::TensorBlock TensorBlock; + TensorBlock blk(0, block_dims, block_strides, tensor_strides, block_data); + + using TensorBlockIO = internal::TensorBlockIOV2; + using IODst = typename TensorBlockIO::Dst; + using IOSrc = typename TensorBlockIO::Src; + + // Read from a tensor into a block. + IODst dst(blk.block_sizes(), block_strides, block_data, 0); + IOSrc src(tensor_strides, tensor_data, blk.first_coeff_index()); + + TensorBlockIO::Copy(dst, src, + /*dst_to_src_dim_map=*/block_to_tensor_dim); + + TensorMap > block_tensor(block_data, block_dims); + TensorMap > tensor_tensor(tensor_data, tensor_dims); + + for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) { + for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) { + for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) { + float block_value = block_tensor(d2, d1, d0); + float tensor_value = tensor_tensor(d0, d1, d2); + VERIFY_IS_EQUAL(block_value, tensor_value); + } + } + } +} + +// This is the special case for reading data with reordering, when dimensions +// before/after reordering are the same. Squeezing reads in this case is allowed +// because we reorder outer dimensions. +template +static void test_block_io_copy_using_reordered_dimensions_squeeze() { + DSizes tensor_dims(7, 5, 9, 9); + DSizes block_dims = tensor_dims; + + DSizes block_to_tensor_dim; + block_to_tensor_dim[0] = 0; + block_to_tensor_dim[1] = 1; + block_to_tensor_dim[2] = 3; + block_to_tensor_dim[3] = 2; + + auto tensor_strides = internal::strides(tensor_dims); + auto block_strides = internal::strides(block_dims); + + Tensor block(block_dims); + Tensor tensor(tensor_dims); + tensor.setRandom(); + + float* tensor_data = tensor.data(); + float* block_data = block.data(); + + typedef internal::TensorBlock TensorBlock; + TensorBlock blk(0, block_dims, block_strides, tensor_strides, block_data); + + using TensorBlockIO = internal::TensorBlockIOV2; + using IODst = typename TensorBlockIO::Dst; + using IOSrc = typename TensorBlockIO::Src; + + // Read from a tensor into a block. 
+ IODst dst(blk.block_sizes(), block_strides, block_data, 0); + IOSrc src(tensor_strides, tensor_data, blk.first_coeff_index()); + + TensorBlockIO::Copy(dst, src, + /*dst_to_src_dim_map=*/block_to_tensor_dim); + + TensorMap > block_tensor(block_data, block_dims); + TensorMap > tensor_tensor(tensor_data, tensor_dims); + + for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) { + for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) { + for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) { + for (Index d3 = 0; d3 < tensor_dims[3]; ++d3) { + float block_value = block_tensor(d0, d1, d3, d2); + float tensor_value = tensor_tensor(d0, d1, d2, d3); + VERIFY_IS_EQUAL(block_value, tensor_value); + } + } + } + } +} + +template +static void test_block_io_zero_stride() { + DSizes rnd_dims = RandomDims<5>(1, 30); + + DSizes input_tensor_dims = rnd_dims; + input_tensor_dims[0] = 1; + input_tensor_dims[2] = 1; + input_tensor_dims[4] = 1; + + Tensor input(input_tensor_dims); + input.setRandom(); + + DSizes output_tensor_dims = rnd_dims; + + auto input_tensor_strides = internal::strides(input_tensor_dims); + auto output_tensor_strides = internal::strides(output_tensor_dims); + + auto input_tensor_strides_with_zeros = input_tensor_strides; + input_tensor_strides_with_zeros[0] = 0; + input_tensor_strides_with_zeros[2] = 0; + input_tensor_strides_with_zeros[4] = 0; + + Tensor output(output_tensor_dims); + output.setRandom(); + + using TensorBlockIO = internal::TensorBlockIOV2; + using IODst = typename TensorBlockIO::Dst; + using IOSrc = typename TensorBlockIO::Src; + + // Write data from input to output with broadcasting in dims [0, 2, 4]. + IODst dst(output_tensor_dims, output_tensor_strides, output.data(), 0); + IOSrc src(input_tensor_strides_with_zeros, input.data(), 0); + TensorBlockIO::Copy(dst, src); + + for (int i = 0; i < output_tensor_dims[0]; ++i) { + for (int j = 0; j < output_tensor_dims[1]; ++j) { + for (int k = 0; k < output_tensor_dims[2]; ++k) { + for (int l = 0; l < output_tensor_dims[3]; ++l) { + for (int m = 0; m < output_tensor_dims[4]; ++m) { + float input_value = input(0, j, 0, l, 0); + float output_value = output(i, j, k, l, m); + VERIFY_IS_EQUAL(input_value, output_value); + } + } + } + } + } +} + +template +static void test_block_io_squeeze_ones() { + using TensorBlockIO = internal::TensorBlockIOV2; + using IODst = typename TensorBlockIO::Dst; + using IOSrc = typename TensorBlockIO::Src; + + // Total size > 1. + { + DSizes block_sizes(1, 2, 1, 2, 1); + auto strides = internal::strides(block_sizes); + + // Create a random input tensor. + Tensor input(block_sizes); + input.setRandom(); + + Tensor output(block_sizes); + + IODst dst(block_sizes, strides, output.data(), 0); + IOSrc src(strides, input.data()); + TensorBlockIO::Copy(dst, src); + + for (Index i = 0; i < block_sizes.TotalSize(); ++i) { + VERIFY_IS_EQUAL(output.data()[i], input.data()[i]); + } + } + + // Total size == 1. + { + DSizes block_sizes(1, 1, 1, 1, 1); + auto strides = internal::strides(block_sizes); + + // Create a random input tensor. 
+ Tensor input(block_sizes); + input.setRandom(); + + Tensor output(block_sizes); + + IODst dst(block_sizes, strides, output.data(), 0); + IOSrc src(strides, input.data()); + TensorBlockIO::Copy(dst, src); + + for (Index i = 0; i < block_sizes.TotalSize(); ++i) { + VERIFY_IS_EQUAL(output.data()[i], input.data()[i]); + } + } +} + +#define CALL_SUBTESTS(NAME) \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())) + +EIGEN_DECLARE_TEST(cxx11_tensor_block_io) { + // clang-format off + CALL_SUBTESTS(test_block_io_copy_data_from_source_to_target); + CALL_SUBTESTS(test_block_io_copy_using_reordered_dimensions); + + CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_do_not_squeeze()); + CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_do_not_squeeze()); + + CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_squeeze()); + CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_squeeze()); + + CALL_SUBTEST(test_block_io_zero_stride()); + CALL_SUBTEST(test_block_io_zero_stride()); + + CALL_SUBTEST(test_block_io_squeeze_ones()); + CALL_SUBTEST(test_block_io_squeeze_ones()); + // clang-format on +} \ No newline at end of file -- cgit v1.2.3
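A condensed sketch of the block evaluation flow exercised by
cxx11_tensor_block_eval.cpp above. This is only a sketch under assumptions:
the defaulted IndexType of TensorBlockDescriptor and the availability of
blockV2() on the tensor evaluator are assumed, not guaranteed by this patch
alone.

template <typename T, int NumDims, int Layout>
Eigen::Tensor<T, NumDims, Layout> EvalAsSingleBlock(
    const Eigen::Tensor<T, NumDims, Layout>& input) {
  using namespace Eigen;
  DefaultDevice d;

  // Evaluator produces tensor blocks of the wrapped expression.
  TensorEvaluator<const Tensor<T, NumDims, Layout>, DefaultDevice> eval(input, d);

  // Describe one block at offset 0 covering the whole tensor, and create a
  // scratch allocator that owns temporary buffers used for materialization.
  internal::TensorBlockDescriptor<NumDims> desc(0, input.dimensions());
  internal::TensorBlockScratchAllocator<DefaultDevice> scratch(d);

  // Produce the block (blockV2() availability on this evaluator is assumed)
  // and assign its expression to the output tensor.
  auto block = eval.blockV2(desc, scratch);
  Tensor<T, NumDims, Layout> out(desc.dimensions());
  out = block.expr();

  // Release any temporary buffers referenced by the block expression.
  block.cleanup();
  return out;
}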