From 6913221c43c6ad41b1fbfc0d263d2764abd11ad2 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 25 Jul 2018 13:51:10 -0700 Subject: Add tiled evaluation support to TensorExecutor --- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 278689915..7ff0d323b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -1,4 +1,5 @@ // This file is part of Eigen, a lightweight C++ template library +// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2014 Benoit Steiner @@ -110,6 +111,7 @@ struct TensorEvaluator, Device> enum { IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; -- cgit v1.2.3 From 966c2a7bb62a8b5b9ecd349730ffcd3b5719837d Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 27 Jul 2018 12:45:17 -0700 Subject: Rename Index to StorageIndex + use Eigen::Array and Eigen::Map when possible --- unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 418 ++++++++------------- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 1 - .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 9 +- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 185 ++++----- unsupported/test/cxx11_tensor_block_access.cpp | 292 ++++++++------ unsupported/test/cxx11_tensor_executor.cpp | 20 +- 6 files changed, 447 insertions(+), 478 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 5321acecf..84cf6d216 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -67,21 +67,21 @@ enum class TensorBlockShapeType { struct TensorOpResourceRequirements { TensorBlockShapeType block_shape; - std::size_t block_total_size; + Index block_total_size; // TODO(andydavis) Add 'target_num_threads' to support communication of // thread-resource requirements. This will allow ops deep in the // expression tree (like reductions) to communicate resources // requirements based on local state (like the total number of reductions // to be computed). TensorOpResourceRequirements(internal::TensorBlockShapeType shape, - const std::size_t size) + const Index size) : block_shape(shape), block_total_size(size) {} }; // Tries to merge multiple resource requirements. EIGEN_STRONG_INLINE void MergeResourceRequirements( const std::vector& resources, - TensorBlockShapeType* block_shape, std::size_t* block_total_size) { + TensorBlockShapeType* block_shape, Index* block_total_size) { if (resources.empty()) { return; } @@ -108,12 +108,12 @@ EIGEN_STRONG_INLINE void MergeResourceRequirements( * This class represents a tensor block specified by the index of the * first block coefficient, and the size of the block in each dimension. */ -template +template class TensorBlock { public: - typedef DSizes Dimensions; + typedef DSizes Dimensions; - TensorBlock(const Index first_coeff_index, const Dimensions& block_sizes, + TensorBlock(const StorageIndex first_coeff_index, const Dimensions& block_sizes, const Dimensions& block_strides, const Dimensions& tensor_strides, Scalar* data) : m_first_coeff_index(first_coeff_index), @@ -122,7 +122,7 @@ class TensorBlock { m_tensor_strides(tensor_strides), m_data(data) {} - Index first_coeff_index() const { return m_first_coeff_index; } + StorageIndex first_coeff_index() const { return m_first_coeff_index; } const Dimensions& block_sizes() const { return m_block_sizes; } @@ -135,108 +135,33 @@ class TensorBlock { const Scalar* data() const { return m_data; } private: - Index m_first_coeff_index; + StorageIndex m_first_coeff_index; Dimensions m_block_sizes; Dimensions m_block_strides; Dimensions m_tensor_strides; Scalar* m_data; // Not owned. }; -template +template struct TensorBlockCopyOp { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const Index num_coeff_to_copy, const Index dst_index, - const Index dst_stride, Scalar* EIGEN_RESTRICT dst_data, - const Index src_index, const Index src_stride, + const StorageIndex num_coeff_to_copy, const StorageIndex dst_index, + const StorageIndex dst_stride, Scalar* EIGEN_RESTRICT dst_data, + const StorageIndex src_index, const StorageIndex src_stride, const Scalar* EIGEN_RESTRICT src_data) { - for (Index i = 0; i < num_coeff_to_copy; ++i) { - dst_data[dst_index + i * dst_stride] = - src_data[src_index + i * src_stride]; - } - } -}; + const Scalar* src_base = &src_data[src_index]; + Scalar* dst_base = &dst_data[dst_index]; -// NOTE: Benchmarks run on an implementation of this that broke each of the -// loops in these conditionals into it's own template specialization (to -// avoid conditionals in the caller's loop) did not show an improvement. -template -struct TensorBlockCopyOp { - typedef typename packet_traits::type Packet; - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const Index num_coeff_to_copy, const Index dst_index, - const Index dst_stride, Scalar* EIGEN_RESTRICT dst_data, - const Index src_index, const Index src_stride, - const Scalar* EIGEN_RESTRICT src_data) { - if (src_stride == 1) { - const Index packet_size = internal::unpacket_traits::size; - const Index vectorized_size = - (num_coeff_to_copy / packet_size) * packet_size; - if (dst_stride == 1) { - // LINEAR - for (Index i = 0; i < vectorized_size; i += packet_size) { - Packet p = internal::ploadu(src_data + src_index + i); - internal::pstoreu(dst_data + dst_index + i, p); - } - for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { - dst_data[dst_index + i] = src_data[src_index + i]; - } - } else { - // SCATTER - for (Index i = 0; i < vectorized_size; i += packet_size) { - Packet p = internal::ploadu(src_data + src_index + i); - internal::pscatter( - dst_data + dst_index + i * dst_stride, p, dst_stride); - } - for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { - dst_data[dst_index + i * dst_stride] = src_data[src_index + i]; - } - } - } else if (src_stride == 0) { - const Index packet_size = internal::unpacket_traits::size; - const Index vectorized_size = - (num_coeff_to_copy / packet_size) * packet_size; - if (dst_stride == 1) { - // LINEAR - for (Index i = 0; i < vectorized_size; i += packet_size) { - Packet p = internal::pload1(src_data + src_index); - internal::pstoreu(dst_data + dst_index + i, p); - } - for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { - dst_data[dst_index + i] = src_data[src_index]; - } - } else { - // SCATTER - for (Index i = 0; i < vectorized_size; i += packet_size) { - Packet p = internal::pload1(src_data + src_index); - internal::pscatter( - dst_data + dst_index + i * dst_stride, p, dst_stride); - } - for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { - dst_data[dst_index + i * dst_stride] = src_data[src_index]; - } - } - } else { - if (dst_stride == 1) { - // GATHER - const Index packet_size = internal::unpacket_traits::size; - const Index vectorized_size = - (num_coeff_to_copy / packet_size) * packet_size; - for (Index i = 0; i < vectorized_size; i += packet_size) { - Packet p = internal::pgather( - src_data + src_index + i * src_stride, src_stride); - internal::pstoreu(dst_data + dst_index + i, p); - } - for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { - dst_data[dst_index + i] = src_data[src_index + i * src_stride]; - } - } else { - // RANDOM - for (Index i = 0; i < num_coeff_to_copy; ++i) { - dst_data[dst_index + i * dst_stride] = - src_data[src_index + i * src_stride]; - } - } - } + using Src = const Eigen::Array; + using Dst = Eigen::Array; + + using SrcMap = Eigen::Map>; + using DstMap = Eigen::Map>; + + const SrcMap src(src_base, num_coeff_to_copy, InnerStride<>(src_stride)); + DstMap dst(dst_base, num_coeff_to_copy, InnerStride<>(dst_stride)); + + dst = src; } }; @@ -249,34 +174,34 @@ struct TensorBlockCopyOp { * This class is responsible for copying data between a tensor and a tensor * block. */ -template +template class TensorBlockIO { public: - typedef typename internal::TensorBlock + typedef typename internal::TensorBlock TensorBlock; - typedef typename internal::TensorBlockCopyOp + typedef typename internal::TensorBlockCopyOp TensorBlockCopyOp; protected: struct BlockIteratorState { - Index input_stride; - Index output_stride; - Index input_span; - Index output_span; - Index size; - Index count; + StorageIndex input_stride; + StorageIndex output_stride; + StorageIndex input_span; + StorageIndex output_span; + StorageIndex size; + StorageIndex count; }; static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy( - const TensorBlock& block, Index first_coeff_index, - const array& tensor_to_block_dim_map, - const array& tensor_strides, const Scalar* src_data, + const TensorBlock& block, StorageIndex first_coeff_index, + const array& tensor_to_block_dim_map, + const array& tensor_strides, const Scalar* src_data, Scalar* dst_data) { // Find the innermost tensor dimension whose size is not 1. This is the // effective inner dim. If all dimensions are of size 1, then fallback to // using the actual innermost dim to avoid out-of-bound access. - Index num_size_one_inner_dims = 0; + StorageIndex num_size_one_inner_dims = 0; for (int i = 0; i < NumDims; ++i) { const int dim = cond()(i, NumDims - i - 1); if (block.block_sizes()[tensor_to_block_dim_map[dim]] != 1) { @@ -285,16 +210,16 @@ class TensorBlockIO { } } // Calculate strides and dimensions. - const Index tensor_stride1_dim = cond()( + const StorageIndex tensor_stride1_dim = cond()( num_size_one_inner_dims, NumDims - num_size_one_inner_dims - 1); - const Index block_dim_for_tensor_stride1_dim = + const StorageIndex block_dim_for_tensor_stride1_dim = NumDims == 0 ? 1 : tensor_to_block_dim_map[tensor_stride1_dim]; size_t block_inner_dim_size = NumDims == 0 ? 1 : block.block_sizes()[block_dim_for_tensor_stride1_dim]; for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) { const int dim = cond()(i, NumDims - i - 1); - const Index block_stride = + const StorageIndex block_stride = block.block_strides()[tensor_to_block_dim_map[dim]]; if (block_inner_dim_size == block_stride && block_stride == tensor_strides[dim]) { @@ -306,10 +231,10 @@ class TensorBlockIO { } } - Index inputIndex; - Index outputIndex; - Index input_stride; - Index output_stride; + StorageIndex inputIndex; + StorageIndex outputIndex; + StorageIndex input_stride; + StorageIndex output_stride; // Setup strides to read/write along the tensor's stride1 dimension. if (BlockRead) { @@ -337,7 +262,7 @@ class TensorBlockIO { int num_squeezed_dims = 0; for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { const int dim = cond()(i + 1, NumDims - i - 2); - const Index size = block.block_sizes()[tensor_to_block_dim_map[dim]]; + const StorageIndex size = block.block_sizes()[tensor_to_block_dim_map[dim]]; if (size == 1) { continue; } @@ -362,9 +287,9 @@ class TensorBlockIO { } // Iterate copying data from src to dst. - const Index block_total_size = + const StorageIndex block_total_size = NumDims == 0 ? 1 : block.block_sizes().TotalSize(); - for (Index i = 0; i < block_total_size; i += block_inner_dim_size) { + for (StorageIndex i = 0; i < block_total_size; i += block_inner_dim_size) { TensorBlockCopyOp::Run(block_inner_dim_size, outputIndex, output_stride, dst_data, inputIndex, input_stride, src_data); // Update index. @@ -391,19 +316,18 @@ class TensorBlockIO { * This class is responsible for reading a tensor block. * */ -template -class TensorBlockReader - : public TensorBlockIO { +template +class TensorBlockReader : public TensorBlockIO { public: - typedef typename internal::TensorBlock + typedef typename internal::TensorBlock TensorBlock; - typedef TensorBlockIO + typedef TensorBlockIO Base; static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( TensorBlock* block, const Scalar* src_data) { - array tensor_to_block_dim_map; + array tensor_to_block_dim_map; for (int i = 0; i < NumDims; ++i) { tensor_to_block_dim_map[i] = i; } @@ -412,9 +336,9 @@ class TensorBlockReader } static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - TensorBlock* block, Index first_coeff_index, - const array& tensor_to_block_dim_map, - const array& tensor_strides, const Scalar* src_data) { + TensorBlock* block, StorageIndex first_coeff_index, + const array& tensor_to_block_dim_map, + const array& tensor_strides, const Scalar* src_data) { Base::Copy(*block, first_coeff_index, tensor_to_block_dim_map, tensor_strides, src_data, block->data()); } @@ -429,19 +353,18 @@ class TensorBlockReader * This class is responsible for writing a tensor block. * */ -template -class TensorBlockWriter : public TensorBlockIO { +template +class TensorBlockWriter : public TensorBlockIO { public: - typedef typename internal::TensorBlock + typedef typename internal::TensorBlock TensorBlock; - typedef TensorBlockIO + typedef TensorBlockIO Base; static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( const TensorBlock& block, Scalar* dst_data) { - array tensor_to_block_dim_map; + array tensor_to_block_dim_map; for (int i = 0; i < NumDims; ++i) { tensor_to_block_dim_map[i] = i; } @@ -450,9 +373,9 @@ class TensorBlockWriter : public TensorBlockIO& tensor_to_block_dim_map, - const array& tensor_strides, Scalar* dst_data) { + const TensorBlock& block, StorageIndex first_coeff_index, + const array& tensor_to_block_dim_map, + const array& tensor_strides, Scalar* dst_data) { Base::Copy(block, first_coeff_index, tensor_to_block_dim_map, tensor_strides, block.data(), dst_data); } @@ -468,67 +391,34 @@ class TensorBlockWriter : public TensorBlockIO struct TensorBlockCwiseBinaryOp { - template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const BinaryFunctor& functor, const Index num_coeff, - const Index output_index, const Index output_stride, - OutputScalar* output_data, const Index left_index, - const Index left_stride, const LeftScalar* left_data, - const Index right_index, const Index right_stride, + const BinaryFunctor& functor, const StorageIndex num_coeff, + const StorageIndex output_index, const StorageIndex output_stride, + OutputScalar* output_data, const StorageIndex left_index, + const StorageIndex left_stride, const LeftScalar* left_data, + const StorageIndex right_index, const StorageIndex right_stride, const RightScalar* right_data) { - for (Index i = 0; i < num_coeff; ++i) { - output_data[output_index + i * output_stride] = - functor(left_data[left_index + i * left_stride], - right_data[right_index + i * right_stride]); - } - } -}; + using Lhs = const Eigen::Array; + using Rhs = const Eigen::Array; + using Out = Eigen::Array; -template <> -struct TensorBlockCwiseBinaryOp { - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const BinaryFunctor& functor, const Index num_coeff, - const Index output_index, const Index output_stride, - OutputScalar* output_data, const Index left_index, - const Index left_stride, const LeftScalar* left_data, - const Index right_index, const Index right_stride, - const RightScalar* right_data) { - EIGEN_STATIC_ASSERT(functor_traits::PacketAccess, - YOU_MADE_A_PROGRAMMING_MISTAKE); - typedef typename packet_traits::type OutputPacket; - typedef typename packet_traits::type LeftPacket; - typedef typename packet_traits::type RightPacket; - const Index packet_size = unpacket_traits::size; - EIGEN_STATIC_ASSERT(unpacket_traits::size == packet_size, - YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT(unpacket_traits::size == packet_size, - YOU_MADE_A_PROGRAMMING_MISTAKE); - const Index vectorized_size = (num_coeff / packet_size) * packet_size; - if (output_stride != 1 || left_stride != 1 || right_stride != 1) { - TensorBlockCwiseBinaryOp::Run( - functor, num_coeff, output_index, output_stride, output_data, - left_index, left_stride, left_data, right_index, right_stride, - right_data); - return; - } - // Vectorization for the most common case. - for (Index i = 0; i < vectorized_size; i += packet_size) { - LeftPacket l = internal::ploadu(left_data + left_index + i); - RightPacket r = - internal::ploadu(right_data + right_index + i); - OutputPacket p = functor.packetOp(l, r); - internal::pstoreu( - output_data + output_index + i, p); - } - for (Index i = vectorized_size; i < num_coeff; ++i) { - output_data[output_index + i] = - functor(left_data[left_index + i], right_data[right_index + i]); - } + using LhsMap = Eigen::Map>; + using RhsMap = Eigen::Map>; + using OutMap = Eigen::Map>; + + const LeftScalar* lhs_base = &left_data[left_index]; + const RightScalar* rhs_base = &right_data[right_index]; + OutputScalar* out_base = &output_data[output_index]; + + const LhsMap lhs(lhs_base, num_coeff, InnerStride<>(left_stride)); + const RhsMap rhs(rhs_base, num_coeff, InnerStride<>(right_stride)); + OutMap out(out_base, num_coeff, InnerStride<>(output_stride)); + + out = + Eigen::CwiseBinaryOp(lhs, rhs, functor); } }; @@ -541,28 +431,26 @@ struct TensorBlockCwiseBinaryOp { * This class carries out the binary op on given blocks. * */ -template struct TensorBlockCwiseBinaryIO { - typedef typename internal::TensorBlock::Dimensions Dimensions; - typedef internal::TensorBlockCwiseBinaryOp< - functor_traits::PacketAccess> - TensorBlockCwiseBinaryOp; struct BlockIteratorState { - Index output_stride, output_span; - Index left_stride, left_span; - Index right_stride, right_span; - Index size, count; + StorageIndex output_stride, output_span; + StorageIndex left_stride, left_span; + StorageIndex right_stride, right_span; + StorageIndex size, count; }; template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( const BinaryFunctor& functor, const Dimensions& block_sizes, const Dimensions& block_strides, OutputScalar* output_data, - const array& left_strides, const LeftScalar* left_data, - const array& right_strides, + const array& left_strides, + const LeftScalar* left_data, + const array& right_strides, const RightScalar* right_data) { // Find the innermost dimension whose size is not 1. This is the effective // inner dim. If all dimensions are of size 1, fallback to using the actual @@ -580,7 +468,7 @@ struct TensorBlockCwiseBinaryIO { NumDims == 0 ? 1 : cond()(num_size_one_inner_dims, NumDims - num_size_one_inner_dims - 1); - Index inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim]; + StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim]; for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) { const int dim = cond()(i, NumDims - i - 1); // Merge multiple inner dims into one for larger inner dim size (i.e. @@ -595,10 +483,12 @@ struct TensorBlockCwiseBinaryIO { } } - Index output_index = 0, left_index = 0, right_index = 0; - const Index output_stride = NumDims == 0 ? 1 : block_strides[inner_dim]; - const Index left_stride = NumDims == 0 ? 1 : left_strides[inner_dim]; - const Index right_stride = NumDims == 0 ? 1 : right_strides[inner_dim]; + StorageIndex output_index = 0, left_index = 0, right_index = 0; + const StorageIndex output_stride = + NumDims == 0 ? 1 : block_strides[inner_dim]; + const StorageIndex left_stride = NumDims == 0 ? 1 : left_strides[inner_dim]; + const StorageIndex right_stride = + NumDims == 0 ? 1 : right_strides[inner_dim]; const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; array block_iter_state; @@ -607,7 +497,7 @@ struct TensorBlockCwiseBinaryIO { int num_squeezed_dims = 0; for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { const int dim = cond()(i + 1, NumDims - i - 2); - const Index size = block_sizes[dim]; + const StorageIndex size = block_sizes[dim]; if (size == 1) { continue; } @@ -624,8 +514,9 @@ struct TensorBlockCwiseBinaryIO { } // Compute cwise binary op. - const Index block_total_size = NumDims == 0 ? 1 : block_sizes.TotalSize(); - for (Index i = 0; i < block_total_size; i += inner_dim_size) { + const StorageIndex block_total_size = + NumDims == 0 ? 1 : block_sizes.TotalSize(); + for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) { TensorBlockCwiseBinaryOp::Run(functor, inner_dim_size, output_index, output_stride, output_data, left_index, left_stride, left_data, right_index, @@ -661,10 +552,10 @@ struct TensorBlockCwiseBinaryIO { template struct TensorBlockView { typedef TensorEvaluator Impl; - typedef typename Impl::Index Index; + typedef typename Impl::Index StorageIndex; typedef typename remove_const::type Scalar; static const int NumDims = array_size::value; - typedef DSizes Dimensions; + typedef DSizes Dimensions; // Constructs a TensorBlockView for `impl`. `block` is only used for for // specifying the start offset, shape, and strides of the block. @@ -701,7 +592,7 @@ struct TensorBlockView { } } } - TensorBlock input_block( + TensorBlock input_block( block.first_coeff_index(), m_block_sizes, m_block_strides, block.tensor_strides(), m_allocated_data); impl.block(&input_block); @@ -733,21 +624,21 @@ struct TensorBlockView { * * This class is responsible for iterating over the blocks of a tensor. */ -template +template class TensorBlockMapper { public: - typedef typename internal::TensorBlock + typedef typename internal::TensorBlock TensorBlock; - typedef DSizes Dimensions; + typedef DSizes Dimensions; TensorBlockMapper(const Dimensions& dims, const TensorBlockShapeType block_shape, - size_t min_target_size) + Index min_target_size) : m_dimensions(dims), m_block_dim_sizes(BlockDimensions(dims, block_shape, min_target_size)) { // Calculate block counts by dimension and total block count. - DSizes block_count; - for (size_t i = 0; i < block_count.rank(); ++i) { + DSizes block_count; + for (Index i = 0; i < block_count.rank(); ++i) { block_count[i] = divup(m_dimensions[i], m_block_dim_sizes[i]); } m_total_block_count = array_prod(block_count); @@ -773,15 +664,15 @@ class TensorBlockMapper { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - GetBlockForIndex(Index block_index, Scalar* data) const { - Index first_coeff_index = 0; - DSizes coords; - DSizes sizes; - DSizes strides; + GetBlockForIndex(StorageIndex block_index, Scalar* data) const { + StorageIndex first_coeff_index = 0; + DSizes coords; + DSizes sizes; + DSizes strides; if (NumDims > 0) { if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { - const Index idx = block_index / m_block_strides[i]; + const StorageIndex idx = block_index / m_block_strides[i]; coords[i] = idx * m_block_dim_sizes[i]; sizes[i] = numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]); @@ -799,7 +690,7 @@ class TensorBlockMapper { } } else { for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = block_index / m_block_strides[i]; + const StorageIndex idx = block_index / m_block_strides[i]; coords[i] = idx * m_block_dim_sizes[i]; sizes[i] = numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]); @@ -824,19 +715,20 @@ class TensorBlockMapper { data); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const { return m_total_block_count; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index block_dims_total_size() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex + block_dims_total_size() const { return m_block_dim_sizes.TotalSize(); } private: static Dimensions BlockDimensions(const Dimensions& tensor_dims, const TensorBlockShapeType block_shape, - size_t min_target_size) { - min_target_size = numext::maxi(1, min_target_size); + Index min_target_size) { + min_target_size = numext::maxi(1, min_target_size); // If tensor fully fits into the target size, we'll treat it a single block. Dimensions block_dim_sizes = tensor_dims; @@ -865,14 +757,14 @@ class TensorBlockMapper { dim_size_target, static_cast(tensor_dims[i])); } // Add any un-allocated coefficients to inner dimension(s). - Index total_size = block_dim_sizes.TotalSize(); + StorageIndex total_size = block_dim_sizes.TotalSize(); for (int i = 0; i < NumDims; ++i) { const int dim = cond()(i, NumDims - i - 1); if (block_dim_sizes[dim] < tensor_dims[dim]) { - const Index total_size_other_dims = + const StorageIndex total_size_other_dims = total_size / block_dim_sizes[dim]; - const Index alloc_avail = - divup(min_target_size, total_size_other_dims); + const StorageIndex alloc_avail = + divup(min_target_size, total_size_other_dims); if (alloc_avail == block_dim_sizes[dim]) { // Insufficient excess coefficients to allocate. break; @@ -882,14 +774,14 @@ class TensorBlockMapper { } } } else if (block_shape == TensorBlockShapeType::kSkewedInnerDims) { - Index coeff_to_allocate = min_target_size; + StorageIndex coeff_to_allocate = min_target_size; for (int i = 0; i < NumDims; ++i) { const int dim = cond()(i, NumDims - i - 1); block_dim_sizes[dim] = numext::mini(coeff_to_allocate, tensor_dims[dim]); - coeff_to_allocate = - divup(coeff_to_allocate, - numext::maxi(static_cast(1), block_dim_sizes[dim])); + coeff_to_allocate = divup( + coeff_to_allocate, + numext::maxi(static_cast(1), block_dim_sizes[dim])); } eigen_assert(coeff_to_allocate == 1); } else { @@ -908,7 +800,7 @@ class TensorBlockMapper { Dimensions m_block_dim_sizes; Dimensions m_block_strides; Dimensions m_tensor_strides; - Index m_total_block_count; + StorageIndex m_total_block_count; }; /** @@ -923,12 +815,12 @@ class TensorBlockMapper { * processed together. * */ -template +template class TensorSliceBlockMapper { public: - typedef typename internal::TensorBlock + typedef typename internal::TensorBlock TensorBlock; - typedef DSizes Dimensions; + typedef DSizes Dimensions; TensorSliceBlockMapper(const Dimensions& tensor_dims, const Dimensions& tensor_slice_offsets, @@ -942,7 +834,7 @@ class TensorSliceBlockMapper { m_block_stride_order(block_stride_order), m_total_block_count(1) { // Calculate block counts by dimension and total block count. - DSizes block_count; + DSizes block_count; for (size_t i = 0; i < block_count.rank(); ++i) { block_count[i] = divup(m_tensor_slice_extents[i], m_block_dim_sizes[i]); } @@ -969,11 +861,11 @@ class TensorSliceBlockMapper { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - GetBlockForIndex(Index block_index, Scalar* data) const { - Index first_coeff_index = 0; - DSizes coords; - DSizes sizes; - DSizes strides; + GetBlockForIndex(StorageIndex block_index, Scalar* data) const { + StorageIndex first_coeff_index = 0; + DSizes coords; + DSizes sizes; + DSizes strides; if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx = block_index / m_block_strides[i]; @@ -991,16 +883,16 @@ class TensorSliceBlockMapper { m_block_dim_sizes[0]); first_coeff_index += coords[0] * m_tensor_strides[0]; - Index prev_dim = m_block_stride_order[0]; + StorageIndex prev_dim = m_block_stride_order[0]; strides[prev_dim] = 1; for (int i = 1; i < NumDims; ++i) { - const Index curr_dim = m_block_stride_order[i]; + const StorageIndex curr_dim = m_block_stride_order[i]; strides[curr_dim] = strides[prev_dim] * sizes[prev_dim]; prev_dim = curr_dim; } } else { for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = block_index / m_block_strides[i]; + const StorageIndex idx = block_index / m_block_strides[i]; coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i]; sizes[i] = numext::mini( m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i], @@ -1016,10 +908,10 @@ class TensorSliceBlockMapper { m_block_dim_sizes[NumDims - 1]); first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1]; - Index prev_dim = m_block_stride_order[NumDims - 1]; + StorageIndex prev_dim = m_block_stride_order[NumDims - 1]; strides[prev_dim] = 1; for (int i = NumDims - 2; i >= 0; --i) { - const Index curr_dim = m_block_stride_order[i]; + const StorageIndex curr_dim = m_block_stride_order[i]; strides[curr_dim] = strides[prev_dim] * sizes[prev_dim]; prev_dim = curr_dim; } @@ -1029,7 +921,7 @@ class TensorSliceBlockMapper { data); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const { return m_total_block_count; } @@ -1041,7 +933,7 @@ class TensorSliceBlockMapper { Dimensions m_block_dim_sizes; Dimensions m_block_stride_order; Dimensions m_block_strides; - Index m_total_block_count; + StorageIndex m_total_block_count; }; } // namespace internal diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 7ff0d323b..343ab6269 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -1,5 +1,4 @@ // This file is part of Eigen, a lightweight C++ template library -// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2014 Benoit Steiner diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index ba02802d2..f9a1bd68c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -51,12 +51,10 @@ struct TensorEvaluator typename internal::remove_const::type, Index, NumCoords, Layout> TensorBlock; typedef typename internal::TensorBlockReader< - typename internal::remove_const::type, Index, NumCoords, Layout, - PacketAccess> + typename internal::remove_const::type, Index, NumCoords, Layout> TensorBlockReader; typedef typename internal::TensorBlockWriter< - typename internal::remove_const::type, Index, NumCoords, Layout, - PacketAccess> + typename internal::remove_const::type, Index, NumCoords, Layout> TensorBlockWriter; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) @@ -204,8 +202,7 @@ struct TensorEvaluator typename internal::remove_const::type, Index, NumCoords, Layout> TensorBlock; typedef typename internal::TensorBlockReader< - typename internal::remove_const::type, Index, NumCoords, Layout, - PacketAccess> + typename internal::remove_const::type, Index, NumCoords, Layout> TensorBlockReader; // Used for accessor extraction in SYCL Managed TensorMap: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 024de3696..ac5afd891 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -36,15 +36,16 @@ template class TensorExecutor { public: - typedef typename Expression::Index Index; + using StorageIndex = typename Expression::Index; + EIGEN_DEVICE_FUNC static inline void run(const Expression& expr, const Device& device = Device()) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { - const Index size = array_prod(evaluator.dimensions()); - for (Index i = 0; i < size; ++i) { + const StorageIndex size = array_prod(evaluator.dimensions()); + for (StorageIndex i = 0; i < size; ++i) { evaluator.evalScalar(i); } } @@ -56,35 +57,36 @@ class TensorExecutor { * Process all the data with a single cpu thread, using vectorized instructions. */ template -class TensorExecutor { +class TensorExecutor { public: - typedef typename Expression::Index Index; + using StorageIndex = typename Expression::Index; EIGEN_DEVICE_FUNC - static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) - { + static inline void run(const Expression& expr, + const DefaultDevice& device = DefaultDevice()) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const Index size = array_prod(evaluator.dimensions()); + if (needs_assign) { + const StorageIndex size = array_prod(evaluator.dimensions()); const int PacketSize = unpacket_traits::PacketReturnType>::size; // Give compiler a strong possibility to unroll the loop. But don't insist // on unrolling, because if the function is expensive compiler should not // unroll the loop at the expense of inlining. - const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; - for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { - for (Index j = 0; j < 4; j++) { + const StorageIndex UnrolledSize = + (size / (4 * PacketSize)) * 4 * PacketSize; + for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) { + for (StorageIndex j = 0; j < 4; j++) { evaluator.evalPacket(i + j * PacketSize); } } - const Index VectorizedSize = (size / PacketSize) * PacketSize; - for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { + const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize; + for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) { evaluator.evalPacket(i); } - for (Index i = VectorizedSize; i < size; ++i) { + for (StorageIndex i = VectorizedSize; i < size; ++i) { evaluator.evalScalar(i); } } @@ -97,42 +99,41 @@ class TensorExecutor -class TensorExecutor { +class TensorExecutor { public: - typedef typename Expression::Index Index; + using Scalar = typename traits::Scalar; + using ScalarNoConst = typename remove_const::type; + + using Evaluator = TensorEvaluator; + using StorageIndex = typename traits::Index; + + static const int NumDims = traits::NumDimensions; EIGEN_DEVICE_FUNC static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) { - using Evaluator = TensorEvaluator; - - using Index = typename traits::Index; - const int NumDims = traits::NumDimensions; - - using Scalar = typename traits::Scalar; - using ScalarNoConst = typename remove_const::type; - using TensorBlock = - TensorBlock; - using TensorBlockMapper = - TensorBlockMapper; + TensorBlock; + using TensorBlockMapper = TensorBlockMapper; Evaluator evaluator(expr, device); - std::size_t total_size = array_prod(evaluator.dimensions()); - std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + Index total_size = array_prod(evaluator.dimensions()); + Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar); if (total_size < cache_size) { // TODO(andydavis) Reduce block management overhead for small tensors. // TODO(wuke) Do not do this when evaluating TensorBroadcastingOp. internal::TensorExecutor::run(expr, device); + /*Tileable*/ false>::run(expr, device); return; } const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { // Size tensor blocks to fit in cache (or requested target block size). - size_t block_total_size = numext::mini(cache_size, total_size); + Index block_total_size = numext::mini(cache_size, total_size); TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims; // Query expression tree for desired block size/shape. std::vector resources; @@ -146,8 +147,8 @@ class TensorExecutor Scalar* data = static_cast( device.allocate(block_total_size * sizeof(Scalar))); - const Index total_block_count = block_mapper.total_block_count(); - for (Index i = 0; i < total_block_count; ++i) { + const StorageIndex total_block_count = block_mapper.total_block_count(); + for (StorageIndex i = 0; i < total_block_count; ++i) { TensorBlock block = block_mapper.GetBlockForIndex(i, data); evaluator.evalBlock(&block); } @@ -162,37 +163,38 @@ class TensorExecutor * executed on a single core. */ #ifdef EIGEN_USE_THREADS -template +template struct EvalRange { - static void run(Evaluator* evaluator_in, const Index first, const Index last) { + static void run(Evaluator* evaluator_in, const StorageIndex first, + const StorageIndex last) { Evaluator evaluator = *evaluator_in; eigen_assert(last >= first); - for (Index i = first; i < last; ++i) { + for (StorageIndex i = first; i < last; ++i) { evaluator.evalScalar(i); } } - static Index alignBlockSize(Index size) { - return size; - } + static StorageIndex alignBlockSize(StorageIndex size) { return size; } }; -template -struct EvalRange { - static const int PacketSize = unpacket_traits::size; +template +struct EvalRange { + static const int PacketSize = + unpacket_traits::size; - static void run(Evaluator* evaluator_in, const Index first, const Index last) { + static void run(Evaluator* evaluator_in, const StorageIndex first, + const StorageIndex last) { Evaluator evaluator = *evaluator_in; eigen_assert(last >= first); - Index i = first; + StorageIndex i = first; if (last - first >= PacketSize) { eigen_assert(first % PacketSize == 0); - Index last_chunk_offset = last - 4 * PacketSize; + StorageIndex last_chunk_offset = last - 4 * PacketSize; // Give compiler a strong possibility to unroll the loop. But don't insist // on unrolling, because if the function is expensive compiler should not // unroll the loop at the expense of inlining. - for (; i <= last_chunk_offset; i += 4*PacketSize) { - for (Index j = 0; j < 4; j++) { + for (; i <= last_chunk_offset; i += 4 * PacketSize) { + for (StorageIndex j = 0; j < 4; j++) { evaluator.evalPacket(i + j * PacketSize); } } @@ -206,7 +208,7 @@ struct EvalRange { } } - static Index alignBlockSize(Index size) { + static StorageIndex alignBlockSize(StorageIndex size) { // Align block size to packet size and account for unrolling in run above. if (size >= 16 * PacketSize) { return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); @@ -219,24 +221,24 @@ struct EvalRange { template class TensorExecutor { public: - typedef typename Expression::Index Index; + using StorageIndex = typename Expression::Index; static inline void run(const Expression& expr, const ThreadPoolDevice& device) { typedef TensorEvaluator Evaluator; - typedef EvalRange EvalRange; + typedef EvalRange EvalRange; Evaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); if (needs_assign) { - const Index PacketSize = + const StorageIndex PacketSize = Vectorizable ? unpacket_traits::size : 1; - const Index size = array_prod(evaluator.dimensions()); + const StorageIndex size = array_prod(evaluator.dimensions()); device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), EvalRange::alignBlockSize, - [&evaluator](Index first, Index last) { + [&evaluator](StorageIndex first, StorageIndex last) { EvalRange::run(&evaluator, first, last); }); } @@ -247,24 +249,24 @@ class TensorExecutor { template class TensorExecutor { public: - typedef typename Expression::Index Index; + using Scalar = typename traits::Scalar; + using ScalarNoConst = typename remove_const::type; - static inline void run(const Expression& expr, - const ThreadPoolDevice& device) { - typedef TensorEvaluator Evaluator; - typedef typename internal::remove_const< - typename traits::Scalar>::type Scalar; - typedef typename traits::Index Index; + using Evaluator = TensorEvaluator; + using StorageIndex = typename traits::Index; - static const int NumDims = traits::NumDimensions; + static const int NumDims = traits::NumDimensions; - typedef TensorBlock TensorBlock; - typedef TensorBlockMapper - TensorBlockMapper; + static inline void run(const Expression& expr, + const ThreadPoolDevice& device) { + using TensorBlock = + TensorBlock; + using TensorBlockMapper = + TensorBlockMapper; Evaluator evaluator(expr, device); - std::size_t total_size = array_prod(evaluator.dimensions()); - std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + StorageIndex total_size = array_prod(evaluator.dimensions()); + StorageIndex cache_size = device.firstLevelCacheSize() / sizeof(Scalar); if (total_size < cache_size) { // TODO(andydavis) Reduce block management overhead for small tensors. internal::TensorExecutor resources; evaluator.getResourceRequirements(&resources); @@ -296,15 +298,16 @@ class TensorExecutor= -1 && thread_idx < num_threads); Scalar* thread_buf = reinterpret_cast( static_cast(buf) + aligned_blocksize * (thread_idx + 1)); - for (Index i = first; i < last; ++i) { + for (StorageIndex i = first; i < last; ++i) { auto block = block_mapper.GetBlockForIndex(i, thread_buf); evaluator.evalBlock(&block); } @@ -324,51 +327,51 @@ class TensorExecutor class TensorExecutor { public: - typedef typename Expression::Index Index; + typedef typename Expression::Index StorageIndex; static void run(const Expression& expr, const GpuDevice& device); }; #if defined(EIGEN_GPUCC) -template +template struct EigenMetaKernelEval { static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, Index first, Index last, Index step_size) { - for (Index i = first; i < last; i += step_size) { + void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) { + for (StorageIndex i = first; i < last; i += step_size) { eval.evalScalar(i); } } }; -template -struct EigenMetaKernelEval { +template +struct EigenMetaKernelEval { static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, Index first, Index last, Index step_size) { - const Index PacketSize = unpacket_traits::size; - const Index vectorized_size = (last / PacketSize) * PacketSize; - const Index vectorized_step_size = step_size * PacketSize; + void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) { + const StorageIndex PacketSize = unpacket_traits::size; + const StorageIndex vectorized_size = (last / PacketSize) * PacketSize; + const StorageIndex vectorized_step_size = step_size * PacketSize; // Use the vector path - for (Index i = first * PacketSize; i < vectorized_size; + for (StorageIndex i = first * PacketSize; i < vectorized_size; i += vectorized_step_size) { eval.evalPacket(i); } - for (Index i = vectorized_size + first; i < last; i += step_size) { + for (StorageIndex i = vectorized_size + first; i < last; i += step_size) { eval.evalScalar(i); } } }; -template +template __global__ void __launch_bounds__(1024) -EigenMetaKernel(Evaluator eval, Index size) { +EigenMetaKernel(Evaluator eval, StorageIndex size) { - const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; - const Index step_size = blockDim.x * gridDim.x; + const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x; + const StorageIndex step_size = blockDim.x * gridDim.x; const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; - EigenMetaKernelEval::run(eval, first_index, size, step_size); + EigenMetaKernelEval::run(eval, first_index, size, step_size); } /*static*/ @@ -382,12 +385,12 @@ inline void TensorExecutor::run( const int block_size = device.maxGpuThreadsPerBlock(); const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size; - const Index size = array_prod(evaluator.dimensions()); + const StorageIndex size = array_prod(evaluator.dimensions()); // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. const int num_blocks = numext::maxi(numext::mini(max_blocks, divup(size, block_size)), 1); LAUNCH_GPU_KERNEL( - (EigenMetaKernel, Index>), + (EigenMetaKernel, StorageIndex>), num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp index 416b686e4..6feeff231 100644 --- a/unsupported/test/cxx11_tensor_block_access.cpp +++ b/unsupported/test/cxx11_tensor_block_access.cpp @@ -37,6 +37,31 @@ static std::size_t RandomTargetSize(const DSizes& dims) { return internal::random(1, dims.TotalSize()); } +template +static DSizes RandomDims() { + array dims; + for (int i = 0; i < NumDims; ++i) { + dims[i] = internal::random(1, 20); + } + return DSizes(dims); +}; + +/** Dummy data type to test TensorBlock copy ops. */ +struct Data { + Data() : Data(0) {} + explicit Data(int v) { value = v; } + int value; +}; + +bool operator==(const Data& lhs, const Data& rhs) { + return lhs.value == rhs.value; +} + +std::ostream& operator<<(std::ostream& os, const Data& d) { + os << "Data: value=" << d.value; + return os; +} + template static T* GenerateRandomData(const Index& size) { T* data = new T[size]; @@ -46,6 +71,23 @@ static T* GenerateRandomData(const Index& size) { return data; } +template <> +Data* GenerateRandomData(const Index& size) { + Data* data = new Data[size]; + for (int i = 0; i < size; ++i) { + data[i] = Data(internal::random(1, 100)); + } + return data; +} + +template +static void Debug(DSizes dims) { + for (int i = 0; i < NumDims; ++i) { + std::cout << dims[i] << "; "; + } + std::cout << std::endl; +} + template static void test_block_mapper_sanity() { @@ -96,7 +138,7 @@ static void test_block_mapper_sanity() // index in the visited set. Verify that every coeff accessed only once. template static void UpdateCoeffSet( - const internal::TensorBlock& block, + const internal::TensorBlock& block, Index first_coeff_index, int dim_index, std::set* visited_coeffs) { const DSizes block_sizes = block.block_sizes(); const DSizes tensor_strides = block.tensor_strides(); @@ -114,14 +156,13 @@ static void UpdateCoeffSet( } } -template -static void test_block_mapper_maps_every_element() -{ - using T = int; - using TensorBlock = internal::TensorBlock; - using TensorBlockMapper = internal::TensorBlockMapper; +template +static void test_block_mapper_maps_every_element() { + using TensorBlock = internal::TensorBlock; + using TensorBlockMapper = + internal::TensorBlockMapper; - DSizes dims(5, 7, 11, 17); + DSizes dims = RandomDims(); // Keep track of elements indices available via block access. std::set coeff_set; @@ -131,29 +172,36 @@ static void test_block_mapper_maps_every_element() for (int i = 0; i < block_mapper.total_block_count(); ++i) { TensorBlock block = block_mapper.GetBlockForIndex(i, nullptr); - UpdateCoeffSet(block, block.first_coeff_index(), - choose(Layout, 3, 0), &coeff_set); + UpdateCoeffSet(block, block.first_coeff_index(), + choose(Layout, NumDims - 1, 0), + &coeff_set); } // Verify that every coefficient in the original Tensor is accessible through // TensorBlock only once. - auto total_coeffs = static_cast(dims.TotalSize()); + Index total_coeffs = dims.TotalSize(); VERIFY_IS_EQUAL(coeff_set.size(), total_coeffs); - VERIFY_IS_EQUAL(*coeff_set.begin(), static_cast(0)); - VERIFY_IS_EQUAL(*coeff_set.rbegin(), static_cast(total_coeffs - 1)); + VERIFY_IS_EQUAL(*coeff_set.begin(), 0); + VERIFY_IS_EQUAL(*coeff_set.rbegin(), total_coeffs - 1); } -template -static void test_slice_block_mapper_maps_every_element() -{ - using T = int; - using TensorBlock = internal::TensorBlock; +template +static void test_slice_block_mapper_maps_every_element() { + using TensorBlock = internal::TensorBlock; using TensorSliceBlockMapper = - internal::TensorSliceBlockMapper; + internal::TensorSliceBlockMapper; - DSizes tensor_dims(5,7,11,17); - DSizes tensor_slice_offsets(1,3,5,7); - DSizes tensor_slice_extents(3,2,4,5); + DSizes tensor_dims = RandomDims(); + DSizes tensor_slice_offsets = RandomDims(); + DSizes tensor_slice_extents = RandomDims(); + + // Make sure that tensor offsets + extents do not overflow. + for (int i = 0; i < NumDims; ++i) { + tensor_slice_offsets[i] = + numext::mini(tensor_dims[i] - 1, tensor_slice_offsets[i]); + tensor_slice_extents[i] = numext::mini( + tensor_slice_extents[i], tensor_dims[i] - tensor_slice_offsets[i]); + } // Keep track of elements indices available via block access. std::set coeff_set; @@ -161,61 +209,59 @@ static void test_slice_block_mapper_maps_every_element() auto total_coeffs = static_cast(tensor_slice_extents.TotalSize()); // Pick a random dimension sizes for the tensor blocks. - DSizes block_sizes; - for (int i = 0; i < 4; ++i) { + DSizes block_sizes; + for (int i = 0; i < NumDims; ++i) { block_sizes[i] = internal::random(1, tensor_slice_extents[i]); } TensorSliceBlockMapper block_mapper(tensor_dims, tensor_slice_offsets, tensor_slice_extents, block_sizes, - DimensionList()); + DimensionList()); for (int i = 0; i < block_mapper.total_block_count(); ++i) { TensorBlock block = block_mapper.GetBlockForIndex(i, nullptr); - UpdateCoeffSet(block, block.first_coeff_index(), - choose(Layout, 3, 0), &coeff_set); + UpdateCoeffSet(block, block.first_coeff_index(), + choose(Layout, NumDims - 1, 0), + &coeff_set); } VERIFY_IS_EQUAL(coeff_set.size(), total_coeffs); } -template -static void test_block_io_copy_data_from_source_to_target() -{ - using T = float; - - typedef internal::TensorBlock TensorBlock; - typedef internal::TensorBlockMapper TensorBlockMapper; +template +static void test_block_io_copy_data_from_source_to_target() { + typedef internal::TensorBlock TensorBlock; + typedef internal::TensorBlockMapper + TensorBlockMapper; - typedef internal::TensorBlockReader + typedef internal::TensorBlockReader TensorBlockReader; - typedef internal::TensorBlockWriter + typedef internal::TensorBlockWriter TensorBlockWriter; - typedef std::vector> DataVector; - - DSizes input_tensor_dims(5, 7, 11, 17, 3); + DSizes input_tensor_dims = RandomDims(); const auto input_tensor_size = input_tensor_dims.TotalSize(); - DataVector input_data(input_tensor_size, 0); - for (int i = 0; i < input_tensor_size; ++i) { - input_data[i] = internal::random(); - } - DataVector output_data(input_tensor_size, 0); + T* input_data = GenerateRandomData(input_tensor_size); + T* output_data = new T[input_tensor_size]; TensorBlockMapper block_mapper(input_tensor_dims, RandomShape(), RandomTargetSize(input_tensor_dims)); + T* block_data = new T[block_mapper.block_dims_total_size()]; - DataVector block_data(block_mapper.block_dims_total_size(), 0); for (int i = 0; i < block_mapper.total_block_count(); ++i) { - TensorBlock block = block_mapper.GetBlockForIndex(i, block_data.data()); - TensorBlockReader::Run(&block, input_data.data()); - TensorBlockWriter::Run(block, output_data.data()); + TensorBlock block = block_mapper.GetBlockForIndex(i, block_data); + TensorBlockReader::Run(&block, input_data); + TensorBlockWriter::Run(block, output_data); } for (int i = 0; i < input_tensor_size; ++i) { VERIFY_IS_EQUAL(input_data[i], output_data[i]); } + + delete[] input_data; + delete[] output_data; + delete[] block_data; } template @@ -261,31 +307,32 @@ static array ComputeStrides( return strides; } -template +template static void test_block_io_copy_using_reordered_dimensions() { - typedef internal::TensorBlock TensorBlock; - typedef internal::TensorBlockMapper + typedef internal::TensorBlock TensorBlock; + typedef internal::TensorBlockMapper TensorBlockMapper; - typedef internal::TensorBlockReader + typedef internal::TensorBlockReader TensorBlockReader; - typedef internal::TensorBlockWriter + typedef internal::TensorBlockWriter TensorBlockWriter; - DSizes input_tensor_dims(5, 7, 11, 17, 3); + DSizes input_tensor_dims = RandomDims(); const auto input_tensor_size = input_tensor_dims.TotalSize(); // Create a random input tensor. - auto* input_data = GenerateRandomData(input_tensor_size); + T* input_data = GenerateRandomData(input_tensor_size); // Create a random dimension re-ordering/shuffle. - std::vector shuffle = {0, 1, 2, 3, 4}; + std::vector shuffle; + for (int i = 0; i < NumDims; ++i) shuffle.push_back(i); std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937()); - DSizes output_tensor_dims; - array input_to_output_dim_map; - array output_to_input_dim_map; - for (Index i = 0; i < 5; ++i) { + DSizes output_tensor_dims; + array input_to_output_dim_map; + array output_to_input_dim_map; + for (Index i = 0; i < NumDims; ++i) { output_tensor_dims[shuffle[i]] = input_tensor_dims[i]; input_to_output_dim_map[i] = shuffle[i]; output_to_input_dim_map[shuffle[i]] = i; @@ -295,17 +342,17 @@ static void test_block_io_copy_using_reordered_dimensions() { TensorBlockMapper block_mapper(output_tensor_dims, RandomShape(), RandomTargetSize(input_tensor_dims)); - auto* block_data = new float[block_mapper.block_dims_total_size()]; - auto* output_data = new float[input_tensor_size]; + auto* block_data = new T[block_mapper.block_dims_total_size()]; + auto* output_data = new T[input_tensor_size]; - array input_tensor_strides = - ComputeStrides(input_tensor_dims); - array output_tensor_strides = - ComputeStrides(output_tensor_dims); + array input_tensor_strides = + ComputeStrides(input_tensor_dims); + array output_tensor_strides = + ComputeStrides(output_tensor_dims); for (Index i = 0; i < block_mapper.total_block_count(); ++i) { TensorBlock block = block_mapper.GetBlockForIndex(i, block_data); - const Index first_coeff_index = GetInputIndex( + const Index first_coeff_index = GetInputIndex( block.first_coeff_index(), output_to_input_dim_map, input_tensor_strides, output_tensor_strides); TensorBlockReader::Run(&block, first_coeff_index, input_to_output_dim_map, @@ -327,18 +374,21 @@ template static void test_block_io_zero_stride() { typedef internal::TensorBlock TensorBlock; - typedef internal::TensorBlockReader + typedef internal::TensorBlockReader TensorBlockReader; - typedef internal::TensorBlockWriter + typedef internal::TensorBlockWriter TensorBlockWriter; - DSizes input_tensor_dims(1, 2, 1, 3, 1); - const auto input_tensor_size = input_tensor_dims.TotalSize(); + DSizes rnd_dims = RandomDims<5>(); - // Create a random input tensor. + DSizes input_tensor_dims = rnd_dims; + input_tensor_dims[0] = 1; + input_tensor_dims[2] = 1; + input_tensor_dims[4] = 1; + const auto input_tensor_size = input_tensor_dims.TotalSize(); auto* input_data = GenerateRandomData(input_tensor_size); - DSizes output_tensor_dims(3, 2, 3, 3, 2); + DSizes output_tensor_dims = rnd_dims; DSizes input_tensor_strides( ComputeStrides(input_tensor_dims)); @@ -401,9 +451,9 @@ static void test_block_io_zero_stride() template static void test_block_io_squeeze_ones() { typedef internal::TensorBlock TensorBlock; - typedef internal::TensorBlockReader + typedef internal::TensorBlockReader TensorBlockReader; - typedef internal::TensorBlockWriter + typedef internal::TensorBlockWriter TensorBlockWriter; // Total size > 1. @@ -467,23 +517,23 @@ static void test_block_io_squeeze_ones() { } } -template +template static void test_block_cwise_binary_io_basic() { - typedef internal::scalar_sum_op BinaryFunctor; - typedef internal::TensorBlockCwiseBinaryIO BinaryFunctor; + typedef internal::TensorBlockCwiseBinaryIO TensorBlockCwiseBinaryIO; - DSizes block_sizes(2, 3, 5, 7, 11); - DSizes strides(ComputeStrides(block_sizes)); + DSizes block_sizes = RandomDims(); + DSizes strides(ComputeStrides(block_sizes)); const auto total_size = block_sizes.TotalSize(); // Create a random input tensors. - auto* left_data = GenerateRandomData(total_size); - auto* right_data = GenerateRandomData(total_size); + T* left_data = GenerateRandomData(total_size); + T* right_data = GenerateRandomData(total_size); - auto* output_data = new float[total_size]; + T* output_data = new T[total_size]; BinaryFunctor functor; TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data, strides, left_data, strides, right_data); @@ -532,13 +582,22 @@ static void test_block_cwise_binary_io_zero_strides() { Layout> TensorBlockCwiseBinaryIO; - DSizes left_sizes(1, 3, 1, 7, 1); + DSizes rnd_dims = RandomDims<5>(); + + DSizes left_sizes = rnd_dims; + left_sizes[0] = 1; + left_sizes[2] = 1; + left_sizes[4] = 1; + DSizes left_strides(ComputeStrides(left_sizes)); left_strides[0] = 0; left_strides[2] = 0; left_strides[4] = 0; - DSizes right_sizes(2, 1, 5, 1, 11); + DSizes right_sizes = rnd_dims; + right_sizes[1] = 0; + right_sizes[3] = 0; + DSizes right_strides(ComputeStrides(right_sizes)); right_strides[1] = 0; right_strides[3] = 0; @@ -547,7 +606,7 @@ static void test_block_cwise_binary_io_zero_strides() { auto* left_data = GenerateRandomData(left_sizes.TotalSize()); auto* right_data = GenerateRandomData(right_sizes.TotalSize()); - DSizes output_sizes(2, 3, 5, 7, 11); + DSizes output_sizes = rnd_dims; DSizes output_strides(ComputeStrides(output_sizes)); const auto output_total_size = output_sizes.TotalSize(); @@ -557,11 +616,11 @@ static void test_block_cwise_binary_io_zero_strides() { TensorBlockCwiseBinaryIO::Run(functor, output_sizes, output_strides, output_data, left_strides, left_data, right_strides, right_data); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { - for (int m = 0; m < 11; ++m) { + for (int i = 0; i < rnd_dims[0]; ++i) { + for (int j = 0; j < rnd_dims[1]; ++j) { + for (int k = 0; k < rnd_dims[2]; ++k) { + for (int l = 0; l < rnd_dims[3]; ++l) { + for (int m = 0; m < rnd_dims[4]; ++m) { Index output_index = i * output_strides[0] + j * output_strides[1] + k * output_strides[2] + l * output_strides[3] + m * output_strides[4]; @@ -893,31 +952,44 @@ static void test_empty_dims(const internal::TensorBlockShapeType block_shape) } } -#define CALL_SUBTEST_LAYOUTS(NAME) \ +#define TEST_LAYOUTS(NAME) \ CALL_SUBTEST(NAME()); \ CALL_SUBTEST(NAME()) -#define CALL_SUBTEST_LAYOUTS_WITH_ARG(NAME, ARG) \ +#define TEST_LAYOUTS_AND_DIMS(TYPE, NAME) \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())); \ + CALL_SUBTEST((NAME())) + +#define TEST_LAYOUTS_WITH_ARG(NAME, ARG) \ CALL_SUBTEST(NAME(ARG)); \ CALL_SUBTEST(NAME(ARG)) EIGEN_DECLARE_TEST(cxx11_tensor_block_access) { - CALL_SUBTEST_LAYOUTS(test_block_mapper_sanity); - CALL_SUBTEST_LAYOUTS(test_block_mapper_maps_every_element); - CALL_SUBTEST_LAYOUTS(test_slice_block_mapper_maps_every_element); - CALL_SUBTEST_LAYOUTS(test_block_io_copy_data_from_source_to_target); - CALL_SUBTEST_LAYOUTS(test_block_io_copy_using_reordered_dimensions); - CALL_SUBTEST_LAYOUTS(test_block_io_zero_stride); - CALL_SUBTEST_LAYOUTS(test_block_io_squeeze_ones); - CALL_SUBTEST_LAYOUTS(test_block_cwise_binary_io_basic); - CALL_SUBTEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones); - CALL_SUBTEST_LAYOUTS(test_block_cwise_binary_io_zero_strides); - CALL_SUBTEST_LAYOUTS(test_uniform_block_shape); - CALL_SUBTEST_LAYOUTS(test_skewed_inner_dim_block_shape); - - CALL_SUBTEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kUniformAllDims); - CALL_SUBTEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kSkewedInnerDims); + TEST_LAYOUTS(test_block_mapper_sanity); + TEST_LAYOUTS_AND_DIMS(float, test_block_mapper_maps_every_element); + TEST_LAYOUTS_AND_DIMS(float, test_slice_block_mapper_maps_every_element); + TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_data_from_source_to_target); + TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_data_from_source_to_target); + TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_using_reordered_dimensions); + TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_using_reordered_dimensions); + TEST_LAYOUTS(test_block_io_zero_stride); + TEST_LAYOUTS(test_block_io_squeeze_ones); + TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_binary_io_basic); + TEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones); + TEST_LAYOUTS(test_block_cwise_binary_io_zero_strides); + TEST_LAYOUTS(test_uniform_block_shape); + TEST_LAYOUTS(test_skewed_inner_dim_block_shape); + TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kUniformAllDims); + TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kSkewedInnerDims); } -#undef CALL_SUBTEST_LAYOUTS -#undef CALL_SUBTEST_LAYOUTS_WITH_ARG \ No newline at end of file +#undef TEST_LAYOUTS +#undef TEST_LAYOUTS_WITH_ARG \ No newline at end of file diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index 5ae45ac5b..274f901ce 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -13,7 +13,6 @@ #include -using Eigen::Index; using Eigen::Tensor; using Eigen::RowMajor; using Eigen::ColMajor; @@ -25,9 +24,16 @@ template static void test_execute_binary_expr(Device d) { // Pick a large enough tensor size to bypass small tensor block evaluation // optimization. - Tensor lhs(840, 390, 37); - Tensor rhs(840, 390, 37); - Tensor dst(840, 390, 37); + int d0 = internal::random(100, 200); + int d1 = internal::random(100, 200); + int d2 = internal::random(100, 200); + + static constexpr int Options = 0; + using IndexType = int; + + Tensor lhs(d0, d1, d2); + Tensor rhs(d0, d1, d2); + Tensor dst(d0, d1, d2); lhs.setRandom(); rhs.setRandom(); @@ -40,9 +46,9 @@ static void test_execute_binary_expr(Device d) { Executor::run(Assign(dst, expr), d); - for (int i = 0; i < 840; ++i) { - for (int j = 0; j < 390; ++j) { - for (int k = 0; k < 37; ++k) { + for (int i = 0; i < d0; ++i) { + for (int j = 0; j < d1; ++j) { + for (int k = 0; k < d2; ++k) { float sum = lhs(i, j, k) + rhs(i, j, k); VERIFY_IS_EQUAL(sum, dst(i, j, k)); } -- cgit v1.2.3