diff options
author | Gael Guennebaud <g.gael@free.fr> | 2018-07-31 08:13:00 +0000 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2018-07-31 08:13:00 +0000 |
commit | 678a0dcb12d55e1d85aade7b34c706b2a5d2d49e (patch) | |
tree | 72540698831395b25a5f481ed5182ccbee7e0129 | |
parent | 679eece8760ce9b9ff09e48b6ee8673afcf94caa (diff) | |
parent | 966c2a7bb62a8b5b9ecd349730ffcd3b5719837d (diff) |
Merged in ezhulenev/eigen/tiling_3 (pull request PR-438)
Tiled tensor executor
33 files changed, 1989 insertions, 246 deletions
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 397d55f76..47514703a 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -112,13 +112,13 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorGlobalFunctions.h" #include "src/Tensor/TensorBase.h" +#include "src/Tensor/TensorBlock.h" #include "src/Tensor/TensorEvaluator.h" #include "src/Tensor/TensorExpr.h" #include "src/Tensor/TensorReduction.h" #include "src/Tensor/TensorReductionGpu.h" #include "src/Tensor/TensorArgMax.h" -#include "src/Tensor/TensorBlock.h" #include "src/Tensor/TensorConcatenation.h" #include "src/Tensor/TensorContractionMapper.h" #include "src/Tensor/TensorContractionBlocking.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 027305586..199ddb123 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -68,6 +68,8 @@ class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType> typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind; typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index; + static const int NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} @@ -95,20 +97,33 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + static const int NumDims = XprType::NumDims; enum { - IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned, - PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess, - Layout = TensorEvaluator<LeftArgType, Device>::Layout, - RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess + IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & + TensorEvaluator<RightArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & + TensorEvaluator<RightArgType, Device>::PacketAccess, + BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess & + TensorEvaluator<RightArgType, Device>::BlockAccess, + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess }; + typedef typename internal::TensorBlock< + typename internal::remove_const<Scalar>::type, Index, NumDims, Layout> + TensorBlock; + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { - EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT( + (static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == + static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); } EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -164,6 +179,25 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device> TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + m_leftImpl.getResourceRequirements(resources); + m_rightImpl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) { + if (TensorEvaluator<LeftArgType, Device>::RawAccess && + m_leftImpl.data() != nullptr) { + TensorBlock left_block(block->first_coeff_index(), block->block_sizes(), + block->tensor_strides(), block->tensor_strides(), + m_leftImpl.data() + block->first_coeff_index()); + m_rightImpl.block(&left_block); + } else { + m_rightImpl.block(block); + m_leftImpl.writeBlock(*block); + } + } + /// required by sycl in order to extract the accessor const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; } /// required by sycl in order to extract the accessor diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 59535cd91..84cf6d216 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -14,6 +14,32 @@ namespace Eigen { namespace internal { +namespace { + +// Helper template to choose between ColMajor and RowMajor values. +template <int Layout> +struct cond; + +template <> +struct cond<ColMajor> { + template <typename T> + EIGEN_STRONG_INLINE const T& operator()(const T& col, + const T& /*row*/) const { + return col; + } +}; + +template <> +struct cond<RowMajor> { + template <typename T> + EIGEN_STRONG_INLINE const T& operator()(const T& /*col*/, + const T& row) const { + return row; + } +}; + +} // namespace + /** * \class TensorBlockShapeType * \ingroup CXX11_Tensor_Module @@ -39,6 +65,40 @@ enum class TensorBlockShapeType { kSkewedInnerDims, }; +struct TensorOpResourceRequirements { + TensorBlockShapeType block_shape; + Index block_total_size; + // TODO(andydavis) Add 'target_num_threads' to support communication of + // thread-resource requirements. This will allow ops deep in the + // expression tree (like reductions) to communicate resources + // requirements based on local state (like the total number of reductions + // to be computed). + TensorOpResourceRequirements(internal::TensorBlockShapeType shape, + const Index size) + : block_shape(shape), block_total_size(size) {} +}; + +// Tries to merge multiple resource requirements. +EIGEN_STRONG_INLINE void MergeResourceRequirements( + const std::vector<TensorOpResourceRequirements>& resources, + TensorBlockShapeType* block_shape, Index* block_total_size) { + if (resources.empty()) { + return; + } + // TODO(andydavis) Implement different policies (i.e. revert to a default + // policy if block shapes/sizes conflict). + *block_shape = resources[0].block_shape; + *block_total_size = resources[0].block_total_size; + for (int i = 1; i < resources.size(); ++i) { + if (resources[i].block_shape == TensorBlockShapeType::kSkewedInnerDims && + *block_shape != TensorBlockShapeType::kSkewedInnerDims) { + *block_shape = TensorBlockShapeType::kSkewedInnerDims; + } + *block_total_size = + numext::maxi(*block_total_size, resources[i].block_total_size); + } +} + /** * \class TensorBlock * \ingroup CXX11_Tensor_Module @@ -48,12 +108,12 @@ enum class TensorBlockShapeType { * This class represents a tensor block specified by the index of the * first block coefficient, and the size of the block in each dimension. */ -template <typename Scalar, typename Index, std::size_t NumDims, int Layout> +template <typename Scalar, typename StorageIndex, int NumDims, int Layout> class TensorBlock { public: - typedef DSizes<Index, NumDims> Dimensions; + typedef DSizes<StorageIndex, NumDims> Dimensions; - TensorBlock(const Index first_coeff_index, const Dimensions& block_sizes, + TensorBlock(const StorageIndex first_coeff_index, const Dimensions& block_sizes, const Dimensions& block_strides, const Dimensions& tensor_strides, Scalar* data) : m_first_coeff_index(first_coeff_index), @@ -62,7 +122,7 @@ class TensorBlock { m_tensor_strides(tensor_strides), m_data(data) {} - Index first_coeff_index() const { return m_first_coeff_index; } + StorageIndex first_coeff_index() const { return m_first_coeff_index; } const Dimensions& block_sizes() const { return m_block_sizes; } @@ -75,13 +135,487 @@ class TensorBlock { const Scalar* data() const { return m_data; } private: - Index m_first_coeff_index; + StorageIndex m_first_coeff_index; Dimensions m_block_sizes; Dimensions m_block_strides; Dimensions m_tensor_strides; Scalar* m_data; // Not owned. }; +template <typename Scalar, typename StorageIndex> +struct TensorBlockCopyOp { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const StorageIndex num_coeff_to_copy, const StorageIndex dst_index, + const StorageIndex dst_stride, Scalar* EIGEN_RESTRICT dst_data, + const StorageIndex src_index, const StorageIndex src_stride, + const Scalar* EIGEN_RESTRICT src_data) { + const Scalar* src_base = &src_data[src_index]; + Scalar* dst_base = &dst_data[dst_index]; + + using Src = const Eigen::Array<Scalar, Dynamic, 1>; + using Dst = Eigen::Array<Scalar, Dynamic, 1>; + + using SrcMap = Eigen::Map<Src, 0, InnerStride<>>; + using DstMap = Eigen::Map<Dst, 0, InnerStride<>>; + + const SrcMap src(src_base, num_coeff_to_copy, InnerStride<>(src_stride)); + DstMap dst(dst_base, num_coeff_to_copy, InnerStride<>(dst_stride)); + + dst = src; + } +}; + +/** + * \class TensorBlockIO + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor block IO class. + * + * This class is responsible for copying data between a tensor and a tensor + * block. + */ +template <typename Scalar, typename StorageIndex, int NumDims, int Layout, + bool BlockRead> +class TensorBlockIO { + public: + typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout> + TensorBlock; + typedef typename internal::TensorBlockCopyOp<Scalar, StorageIndex> + TensorBlockCopyOp; + + protected: + struct BlockIteratorState { + StorageIndex input_stride; + StorageIndex output_stride; + StorageIndex input_span; + StorageIndex output_span; + StorageIndex size; + StorageIndex count; + }; + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy( + const TensorBlock& block, StorageIndex first_coeff_index, + const array<StorageIndex, NumDims>& tensor_to_block_dim_map, + const array<StorageIndex, NumDims>& tensor_strides, const Scalar* src_data, + Scalar* dst_data) { + // Find the innermost tensor dimension whose size is not 1. This is the + // effective inner dim. If all dimensions are of size 1, then fallback to + // using the actual innermost dim to avoid out-of-bound access. + StorageIndex num_size_one_inner_dims = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = cond<Layout>()(i, NumDims - i - 1); + if (block.block_sizes()[tensor_to_block_dim_map[dim]] != 1) { + num_size_one_inner_dims = i; + break; + } + } + // Calculate strides and dimensions. + const StorageIndex tensor_stride1_dim = cond<Layout>()( + num_size_one_inner_dims, NumDims - num_size_one_inner_dims - 1); + const StorageIndex block_dim_for_tensor_stride1_dim = + NumDims == 0 ? 1 : tensor_to_block_dim_map[tensor_stride1_dim]; + size_t block_inner_dim_size = + NumDims == 0 ? 1 + : block.block_sizes()[block_dim_for_tensor_stride1_dim]; + for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) { + const int dim = cond<Layout>()(i, NumDims - i - 1); + const StorageIndex block_stride = + block.block_strides()[tensor_to_block_dim_map[dim]]; + if (block_inner_dim_size == block_stride && + block_stride == tensor_strides[dim]) { + block_inner_dim_size *= + block.block_sizes()[tensor_to_block_dim_map[dim]]; + ++num_size_one_inner_dims; + } else { + break; + } + } + + StorageIndex inputIndex; + StorageIndex outputIndex; + StorageIndex input_stride; + StorageIndex output_stride; + + // Setup strides to read/write along the tensor's stride1 dimension. + if (BlockRead) { + inputIndex = first_coeff_index; + outputIndex = 0; + input_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim]; + output_stride = + NumDims == 0 + ? 1 + : block.block_strides()[block_dim_for_tensor_stride1_dim]; + } else { + inputIndex = 0; + outputIndex = first_coeff_index; + input_stride = + NumDims == 0 + ? 1 + : block.block_strides()[block_dim_for_tensor_stride1_dim]; + output_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim]; + } + + const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; + array<BlockIteratorState, at_least_1_dim> block_iter_state; + + // Initialize block iterator state. Squeeze away any dimension of size 1. + int num_squeezed_dims = 0; + for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { + const int dim = cond<Layout>()(i + 1, NumDims - i - 2); + const StorageIndex size = block.block_sizes()[tensor_to_block_dim_map[dim]]; + if (size == 1) { + continue; + } + block_iter_state[num_squeezed_dims].size = size; + if (BlockRead) { + block_iter_state[num_squeezed_dims].input_stride = tensor_strides[dim]; + block_iter_state[num_squeezed_dims].output_stride = + block.block_strides()[tensor_to_block_dim_map[dim]]; + } else { + block_iter_state[num_squeezed_dims].input_stride = + block.block_strides()[tensor_to_block_dim_map[dim]]; + block_iter_state[num_squeezed_dims].output_stride = tensor_strides[dim]; + } + block_iter_state[num_squeezed_dims].input_span = + block_iter_state[num_squeezed_dims].input_stride * + (block_iter_state[num_squeezed_dims].size - 1); + block_iter_state[num_squeezed_dims].output_span = + block_iter_state[num_squeezed_dims].output_stride * + (block_iter_state[num_squeezed_dims].size - 1); + block_iter_state[num_squeezed_dims].count = 0; + ++num_squeezed_dims; + } + + // Iterate copying data from src to dst. + const StorageIndex block_total_size = + NumDims == 0 ? 1 : block.block_sizes().TotalSize(); + for (StorageIndex i = 0; i < block_total_size; i += block_inner_dim_size) { + TensorBlockCopyOp::Run(block_inner_dim_size, outputIndex, output_stride, + dst_data, inputIndex, input_stride, src_data); + // Update index. + for (int j = 0; j < num_squeezed_dims; ++j) { + if (++block_iter_state[j].count < block_iter_state[j].size) { + inputIndex += block_iter_state[j].input_stride; + outputIndex += block_iter_state[j].output_stride; + break; + } + block_iter_state[j].count = 0; + inputIndex -= block_iter_state[j].input_span; + outputIndex -= block_iter_state[j].output_span; + } + } + } +}; + +/** + * \class TensorBlockReader + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor block reader class. + * + * This class is responsible for reading a tensor block. + * + */ +template <typename Scalar, typename StorageIndex, int NumDims, int Layout> +class TensorBlockReader : public TensorBlockIO<Scalar, StorageIndex, NumDims, + Layout, /*BlockRead=*/true> { + public: + typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout> + TensorBlock; + typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/true> + Base; + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + TensorBlock* block, const Scalar* src_data) { + array<StorageIndex, NumDims> tensor_to_block_dim_map; + for (int i = 0; i < NumDims; ++i) { + tensor_to_block_dim_map[i] = i; + } + Base::Copy(*block, block->first_coeff_index(), tensor_to_block_dim_map, + block->tensor_strides(), src_data, block->data()); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + TensorBlock* block, StorageIndex first_coeff_index, + const array<StorageIndex, NumDims>& tensor_to_block_dim_map, + const array<StorageIndex, NumDims>& tensor_strides, const Scalar* src_data) { + Base::Copy(*block, first_coeff_index, tensor_to_block_dim_map, + tensor_strides, src_data, block->data()); + } +}; + +/** + * \class TensorBlockWriter + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor block writer class. + * + * This class is responsible for writing a tensor block. + * + */ +template <typename Scalar, typename StorageIndex, int NumDims, int Layout> +class TensorBlockWriter : public TensorBlockIO<Scalar, StorageIndex, NumDims, + Layout, /*BlockRead=*/false> { + public: + typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout> + TensorBlock; + typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/false> + Base; + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const TensorBlock& block, Scalar* dst_data) { + array<StorageIndex, NumDims> tensor_to_block_dim_map; + for (int i = 0; i < NumDims; ++i) { + tensor_to_block_dim_map[i] = i; + } + Base::Copy(block, block.first_coeff_index(), tensor_to_block_dim_map, + block.tensor_strides(), block.data(), dst_data); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const TensorBlock& block, StorageIndex first_coeff_index, + const array<StorageIndex, NumDims>& tensor_to_block_dim_map, + const array<StorageIndex, NumDims>& tensor_strides, Scalar* dst_data) { + Base::Copy(block, first_coeff_index, tensor_to_block_dim_map, + tensor_strides, block.data(), dst_data); + } +}; + +/** + * \class TensorBlockCwiseBinaryOp + * \ingroup CXX11_Tensor_Module + * + * \brief Carries out a cwise binary op on a number of coefficients. + * + * This class reads strided inputs from left and right operands, and writes the + * result of the cwise binary op to the strided output array. + * + */ +struct TensorBlockCwiseBinaryOp { + template <typename StorageIndex, typename BinaryFunctor, typename OutputScalar, + typename LeftScalar, typename RightScalar> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const BinaryFunctor& functor, const StorageIndex num_coeff, + const StorageIndex output_index, const StorageIndex output_stride, + OutputScalar* output_data, const StorageIndex left_index, + const StorageIndex left_stride, const LeftScalar* left_data, + const StorageIndex right_index, const StorageIndex right_stride, + const RightScalar* right_data) { + using Lhs = const Eigen::Array<LeftScalar, Dynamic, 1>; + using Rhs = const Eigen::Array<RightScalar, Dynamic, 1>; + using Out = Eigen::Array<OutputScalar, Dynamic, 1>; + + using LhsMap = Eigen::Map<Lhs, 0, InnerStride<>>; + using RhsMap = Eigen::Map<Rhs, 0, InnerStride<>>; + using OutMap = Eigen::Map<Out, 0, InnerStride<>>; + + const LeftScalar* lhs_base = &left_data[left_index]; + const RightScalar* rhs_base = &right_data[right_index]; + OutputScalar* out_base = &output_data[output_index]; + + const LhsMap lhs(lhs_base, num_coeff, InnerStride<>(left_stride)); + const RhsMap rhs(rhs_base, num_coeff, InnerStride<>(right_stride)); + OutMap out(out_base, num_coeff, InnerStride<>(output_stride)); + + out = + Eigen::CwiseBinaryOp<BinaryFunctor, LhsMap, RhsMap>(lhs, rhs, functor); + } +}; + +/** + * \class TensorBlockCwiseBinaryIO + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor block IO class for carrying out cwise binary ops. + * + * This class carries out the binary op on given blocks. + * + */ +template <typename BinaryFunctor, typename StorageIndex, typename OutputScalar, + int NumDims, int Layout> +struct TensorBlockCwiseBinaryIO { + typedef typename internal::TensorBlock<OutputScalar, StorageIndex, NumDims, + Layout>::Dimensions Dimensions; + + struct BlockIteratorState { + StorageIndex output_stride, output_span; + StorageIndex left_stride, left_span; + StorageIndex right_stride, right_span; + StorageIndex size, count; + }; + + template <typename LeftScalar, typename RightScalar> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const BinaryFunctor& functor, const Dimensions& block_sizes, + const Dimensions& block_strides, OutputScalar* output_data, + const array<StorageIndex, NumDims>& left_strides, + const LeftScalar* left_data, + const array<StorageIndex, NumDims>& right_strides, + const RightScalar* right_data) { + // Find the innermost dimension whose size is not 1. This is the effective + // inner dim. If all dimensions are of size 1, fallback to using the actual + // innermost dim to avoid out-of-bound access. + int num_size_one_inner_dims = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = cond<Layout>()(i, NumDims - i - 1); + if (block_sizes[dim] != 1) { + num_size_one_inner_dims = i; + break; + } + } + // Calculate strides and dimensions. + const int inner_dim = + NumDims == 0 ? 1 + : cond<Layout>()(num_size_one_inner_dims, + NumDims - num_size_one_inner_dims - 1); + StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim]; + for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) { + const int dim = cond<Layout>()(i, NumDims - i - 1); + // Merge multiple inner dims into one for larger inner dim size (i.e. + // fewer calls to TensorBlockCwiseBinaryOp::Run()). + if (inner_dim_size == block_strides[dim] && + block_strides[dim] == left_strides[dim] && + block_strides[dim] == right_strides[dim]) { + inner_dim_size *= block_sizes[dim]; + ++num_size_one_inner_dims; + } else { + break; + } + } + + StorageIndex output_index = 0, left_index = 0, right_index = 0; + const StorageIndex output_stride = + NumDims == 0 ? 1 : block_strides[inner_dim]; + const StorageIndex left_stride = NumDims == 0 ? 1 : left_strides[inner_dim]; + const StorageIndex right_stride = + NumDims == 0 ? 1 : right_strides[inner_dim]; + + const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; + array<BlockIteratorState, at_least_1_dim> block_iter_state; + + // Initialize block iterator state. Squeeze away any dimension of size 1. + int num_squeezed_dims = 0; + for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { + const int dim = cond<Layout>()(i + 1, NumDims - i - 2); + const StorageIndex size = block_sizes[dim]; + if (size == 1) { + continue; + } + auto& state = block_iter_state[num_squeezed_dims]; + state.output_stride = block_strides[dim]; + state.left_stride = left_strides[dim]; + state.right_stride = right_strides[dim]; + state.size = size; + state.output_span = state.output_stride * (size - 1); + state.left_span = state.left_stride * (size - 1); + state.right_span = state.right_stride * (size - 1); + state.count = 0; + ++num_squeezed_dims; + } + + // Compute cwise binary op. + const StorageIndex block_total_size = + NumDims == 0 ? 1 : block_sizes.TotalSize(); + for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) { + TensorBlockCwiseBinaryOp::Run(functor, inner_dim_size, output_index, + output_stride, output_data, left_index, + left_stride, left_data, right_index, + right_stride, right_data); + // Update index. + for (int j = 0; j < num_squeezed_dims; ++j) { + auto& state = block_iter_state[j]; + if (++state.count < state.size) { + output_index += state.output_stride; + left_index += state.left_stride; + right_index += state.right_stride; + break; + } + state.count = 0; + output_index -= state.output_span; + left_index -= state.left_span; + right_index -= state.right_span; + } + } + } +}; + +/** + * \class TensorBlockView + * \ingroup CXX11_Tensor_Module + * + * \brief Read-only view into a block of data. + * + * This class provides read-only access to a block of data in impl. It may need + * to allocate space for holding the intermediate result. + * + */ +template <class ArgType, class Device> +struct TensorBlockView { + typedef TensorEvaluator<ArgType, Device> Impl; + typedef typename Impl::Index StorageIndex; + typedef typename remove_const<typename Impl::Scalar>::type Scalar; + static const int NumDims = array_size<typename Impl::Dimensions>::value; + typedef DSizes<StorageIndex, NumDims> Dimensions; + + // Constructs a TensorBlockView for `impl`. `block` is only used for for + // specifying the start offset, shape, and strides of the block. + template <typename OtherTensorBlock> + TensorBlockView(const Device& device, + const TensorEvaluator<ArgType, Device>& impl, + const OtherTensorBlock& block) + : m_device(device), + m_block_sizes(block.block_sizes()), + m_data(NULL), + m_allocated_data(NULL) { + if (Impl::RawAccess && impl.data() != NULL) { + m_data = impl.data() + block.first_coeff_index(); + m_block_strides = block.tensor_strides(); + } else { + // Actually make a copy. + + // TODO(wuke): This sometimes put a lot pressure on the heap allocator. + // Consider allowing ops to request additional temporary block memory in + // TensorOpResourceRequirements. + m_allocated_data = static_cast<Scalar*>( + m_device.allocate(m_block_sizes.TotalSize() * sizeof(Scalar))); + m_data = m_allocated_data; + if (NumDims > 0) { + if (static_cast<int>(Impl::Layout) == static_cast<int>(ColMajor)) { + m_block_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_block_strides[i] = m_block_strides[i - 1] * m_block_sizes[i - 1]; + } + } else { + m_block_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_block_strides[i] = m_block_strides[i + 1] * m_block_sizes[i + 1]; + } + } + } + TensorBlock<Scalar, StorageIndex, NumDims, Impl::Layout> input_block( + block.first_coeff_index(), m_block_sizes, m_block_strides, + block.tensor_strides(), m_allocated_data); + impl.block(&input_block); + } + } + + ~TensorBlockView() { + if (m_allocated_data != NULL) { + m_device.deallocate(m_allocated_data); + } + } + + const Dimensions& block_sizes() const { return m_block_sizes; } + const Dimensions& block_strides() const { return m_block_strides; } + const Scalar* data() const { return m_data; } + + private: + const Device& m_device; + Dimensions m_block_sizes, m_block_strides; + const Scalar* m_data; // Not owned. + Scalar* m_allocated_data; // Owned. +}; + /** * \class TensorBlockMapper * \ingroup CXX11_Tensor_Module @@ -90,21 +624,21 @@ class TensorBlock { * * This class is responsible for iterating over the blocks of a tensor. */ -template <typename Scalar, typename Index, std::size_t NumDims, int Layout> +template <typename Scalar, typename StorageIndex, int NumDims, int Layout> class TensorBlockMapper { public: - typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout> + typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout> TensorBlock; - typedef DSizes<Index, NumDims> Dimensions; + typedef DSizes<StorageIndex, NumDims> Dimensions; TensorBlockMapper(const Dimensions& dims, const TensorBlockShapeType block_shape, - size_t min_target_size) + Index min_target_size) : m_dimensions(dims), m_block_dim_sizes(BlockDimensions(dims, block_shape, min_target_size)) { // Calculate block counts by dimension and total block count. - DSizes<Index, NumDims> block_count; - for (size_t i = 0; i < block_count.rank(); ++i) { + DSizes<StorageIndex, NumDims> block_count; + for (Index i = 0; i < block_count.rank(); ++i) { block_count[i] = divup(m_dimensions[i], m_block_dim_sizes[i]); } m_total_block_count = array_prod(block_count); @@ -130,15 +664,15 @@ class TensorBlockMapper { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - GetBlockForIndex(Index block_index, Scalar* data) const { - Index first_coeff_index = 0; - DSizes<Index, NumDims> coords; - DSizes<Index, NumDims> sizes; - DSizes<Index, NumDims> strides; + GetBlockForIndex(StorageIndex block_index, Scalar* data) const { + StorageIndex first_coeff_index = 0; + DSizes<StorageIndex, NumDims> coords; + DSizes<StorageIndex, NumDims> sizes; + DSizes<StorageIndex, NumDims> strides; if (NumDims > 0) { if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { - const Index idx = block_index / m_block_strides[i]; + const StorageIndex idx = block_index / m_block_strides[i]; coords[i] = idx * m_block_dim_sizes[i]; sizes[i] = numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]); @@ -156,7 +690,7 @@ class TensorBlockMapper { } } else { for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = block_index / m_block_strides[i]; + const StorageIndex idx = block_index / m_block_strides[i]; coords[i] = idx * m_block_dim_sizes[i]; sizes[i] = numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]); @@ -181,23 +715,20 @@ class TensorBlockMapper { data); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const { return m_total_block_count; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index block_dims_total_size() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex + block_dims_total_size() const { return m_block_dim_sizes.TotalSize(); } private: - static int InnerDimIndex(Index i) { - return Layout == static_cast<int>(ColMajor) ? i : NumDims - i - 1; - } - static Dimensions BlockDimensions(const Dimensions& tensor_dims, const TensorBlockShapeType block_shape, - size_t min_target_size) { - min_target_size = numext::maxi<size_t>(1, min_target_size); + Index min_target_size) { + min_target_size = numext::maxi<Index>(1, min_target_size); // If tensor fully fits into the target size, we'll treat it a single block. Dimensions block_dim_sizes = tensor_dims; @@ -226,14 +757,14 @@ class TensorBlockMapper { dim_size_target, static_cast<size_t>(tensor_dims[i])); } // Add any un-allocated coefficients to inner dimension(s). - Index total_size = block_dim_sizes.TotalSize(); + StorageIndex total_size = block_dim_sizes.TotalSize(); for (int i = 0; i < NumDims; ++i) { - const int dim = InnerDimIndex(i); + const int dim = cond<Layout>()(i, NumDims - i - 1); if (block_dim_sizes[dim] < tensor_dims[dim]) { - const Index total_size_other_dims = + const StorageIndex total_size_other_dims = total_size / block_dim_sizes[dim]; - const Index alloc_avail = - divup<Index>(min_target_size, total_size_other_dims); + const StorageIndex alloc_avail = + divup<StorageIndex>(min_target_size, total_size_other_dims); if (alloc_avail == block_dim_sizes[dim]) { // Insufficient excess coefficients to allocate. break; @@ -243,14 +774,14 @@ class TensorBlockMapper { } } } else if (block_shape == TensorBlockShapeType::kSkewedInnerDims) { - Index coeff_to_allocate = min_target_size; + StorageIndex coeff_to_allocate = min_target_size; for (int i = 0; i < NumDims; ++i) { - const int dim = InnerDimIndex(i); + const int dim = cond<Layout>()(i, NumDims - i - 1); block_dim_sizes[dim] = numext::mini(coeff_to_allocate, tensor_dims[dim]); - coeff_to_allocate = - divup(coeff_to_allocate, - numext::maxi(static_cast<Index>(1), block_dim_sizes[dim])); + coeff_to_allocate = divup( + coeff_to_allocate, + numext::maxi(static_cast<StorageIndex>(1), block_dim_sizes[dim])); } eigen_assert(coeff_to_allocate == 1); } else { @@ -269,7 +800,7 @@ class TensorBlockMapper { Dimensions m_block_dim_sizes; Dimensions m_block_strides; Dimensions m_tensor_strides; - Index m_total_block_count; + StorageIndex m_total_block_count; }; /** @@ -284,12 +815,12 @@ class TensorBlockMapper { * processed together. * */ -template <typename Scalar, typename Index, std::size_t NumDims, int Layout> +template <typename Scalar, typename StorageIndex, int NumDims, int Layout> class TensorSliceBlockMapper { public: - typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout> + typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout> TensorBlock; - typedef DSizes<Index, NumDims> Dimensions; + typedef DSizes<StorageIndex, NumDims> Dimensions; TensorSliceBlockMapper(const Dimensions& tensor_dims, const Dimensions& tensor_slice_offsets, @@ -303,7 +834,7 @@ class TensorSliceBlockMapper { m_block_stride_order(block_stride_order), m_total_block_count(1) { // Calculate block counts by dimension and total block count. - DSizes<Index, NumDims> block_count; + DSizes<StorageIndex, NumDims> block_count; for (size_t i = 0; i < block_count.rank(); ++i) { block_count[i] = divup(m_tensor_slice_extents[i], m_block_dim_sizes[i]); } @@ -330,11 +861,11 @@ class TensorSliceBlockMapper { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - GetBlockForIndex(Index block_index, Scalar* data) const { - Index first_coeff_index = 0; - DSizes<Index, NumDims> coords; - DSizes<Index, NumDims> sizes; - DSizes<Index, NumDims> strides; + GetBlockForIndex(StorageIndex block_index, Scalar* data) const { + StorageIndex first_coeff_index = 0; + DSizes<StorageIndex, NumDims> coords; + DSizes<StorageIndex, NumDims> sizes; + DSizes<StorageIndex, NumDims> strides; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx = block_index / m_block_strides[i]; @@ -352,16 +883,16 @@ class TensorSliceBlockMapper { m_block_dim_sizes[0]); first_coeff_index += coords[0] * m_tensor_strides[0]; - Index prev_dim = m_block_stride_order[0]; + StorageIndex prev_dim = m_block_stride_order[0]; strides[prev_dim] = 1; for (int i = 1; i < NumDims; ++i) { - const Index curr_dim = m_block_stride_order[i]; + const StorageIndex curr_dim = m_block_stride_order[i]; strides[curr_dim] = strides[prev_dim] * sizes[prev_dim]; prev_dim = curr_dim; } } else { - for (int i = 0; i < static_cast<int>(NumDims) - 1; ++i) { - const Index idx = block_index / m_block_strides[i]; + for (int i = 0; i < NumDims - 1; ++i) { + const StorageIndex idx = block_index / m_block_strides[i]; coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i]; sizes[i] = numext::mini( m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i], @@ -377,10 +908,10 @@ class TensorSliceBlockMapper { m_block_dim_sizes[NumDims - 1]); first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1]; - Index prev_dim = m_block_stride_order[NumDims - 1]; + StorageIndex prev_dim = m_block_stride_order[NumDims - 1]; strides[prev_dim] = 1; for (int i = NumDims - 2; i >= 0; --i) { - const Index curr_dim = m_block_stride_order[i]; + const StorageIndex curr_dim = m_block_stride_order[i]; strides[curr_dim] = strides[prev_dim] * sizes[prev_dim]; prev_dim = curr_dim; } @@ -390,7 +921,7 @@ class TensorSliceBlockMapper { data); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const { return m_total_block_count; } @@ -402,7 +933,7 @@ class TensorSliceBlockMapper { Dimensions m_block_dim_sizes; Dimensions m_block_stride_order; Dimensions m_block_strides; - Index m_total_block_count; + StorageIndex m_total_block_count; }; } // namespace internal diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index e647b3609..8fecbe657 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -110,6 +110,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> enum { IsAligned = true, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 21ffa2872..085c05f3d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -146,6 +146,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> // slice offsets. IsAligned = false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -343,6 +344,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index a7c1380b8..9f0321880 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -122,6 +122,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy enum { IsAligned = false, PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<LeftArgType, Device>::Layout, RawAccess = false }; @@ -306,6 +307,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De enum { IsAligned = false, PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<LeftArgType, Device>::Layout, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 57b5339d1..86602c27e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -240,6 +240,7 @@ struct TensorContractionEvaluatorBase enum { IsAligned = true, PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1), + BlockAccess = false, Layout = TensorEvaluator<LeftArgType, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 182bef918..e0cbbb315 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -195,6 +195,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> enum { IsAligned = false, PacketAccess = true, + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 25131600d..1ec5819a7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -307,6 +307,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr enum { IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned, PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<InputArgType, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -577,11 +578,11 @@ __global__ void EigenConvolutionKernel1D( const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, float* buffer) { #if defined(EIGEN_HIPCC) - HIP_DYNAMIC_SHARED(float, s) + HIP_DYNAMIC_SHARED(float, s) #else extern __shared__ float s[]; #endif - + const int first_x = blockIdx.x * maxX; const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize); @@ -630,7 +631,7 @@ __global__ void EigenConvolutionKernel2D( const int maxX, const int numY, const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) { #if defined(EIGEN_HIPCC) - HIP_DYNAMIC_SHARED(float, s) + HIP_DYNAMIC_SHARED(float, s) #else extern __shared__ float s[]; #endif @@ -702,7 +703,7 @@ __global__ void EigenConvolutionKernel3D( const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t kernelSizeZ, float* buffer) { #if defined(EIGEN_HIPCC) - HIP_DYNAMIC_SHARED(float, s) + HIP_DYNAMIC_SHARED(float, s) #else extern __shared__ float s[]; #endif @@ -778,6 +779,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr enum { IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned, PacketAccess = false, + BlockAccess = false, Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 65403905a..d301d0c01 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -242,6 +242,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr enum { IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned, PacketAccess = false, + BlockAccess = false, Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 192d4aa7b..5ca47cca7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -290,6 +290,22 @@ struct DSizes : array<DenseIndex, NumDims> { } } +#ifndef EIGEN_EMULATE_CXX11_META_H + template <typename std::ptrdiff_t... Indices> + EIGEN_DEVICE_FUNC DSizes(const Sizes<Indices...>& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } +#else + template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> + EIGEN_DEVICE_FUNC DSizes(const Sizes<V1, V2, V3, V4, V5>& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } +#endif + #if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index d0c027890..af39daa91 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -107,6 +107,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device> enum { IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index fe62ff1ea..f9a1bd68c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -41,11 +41,22 @@ struct TensorEvaluator enum { IsAligned = Derived::IsAligned, PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1), + BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value, Layout = Derived::Layout, CoordAccess = NumCoords > 0, RawAccess = true }; + typedef typename internal::TensorBlock< + typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout> + TensorBlock; + typedef typename internal::TensorBlockReader< + typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout> + TensorBlockReader; + typedef typename internal::TensorBlockWriter< + typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout> + TensorBlockWriter; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) : m_data(const_cast<typename internal::traits<Derived>::template MakePointer<Scalar>::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m) { } @@ -113,6 +124,20 @@ struct TensorEvaluator internal::unpacket_traits<PacketReturnType>::size); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const { + assert(m_data != NULL); + TensorBlockReader::Run(block, m_data); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlock& block) { + assert(m_data != NULL); + TensorBlockWriter::Run(block, m_data); + } + EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<Scalar>::Type data() const { return m_data; } /// required by sycl in order to construct sycl buffer from raw pointer @@ -167,11 +192,19 @@ struct TensorEvaluator<const Derived, Device> enum { IsAligned = Derived::IsAligned, PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1), + BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value, Layout = Derived::Layout, CoordAccess = NumCoords > 0, RawAccess = true }; + typedef typename internal::TensorBlock< + typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout> + TensorBlock; + typedef typename internal::TensorBlockReader< + typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout> + TensorBlockReader; + // Used for accessor extraction in SYCL Managed TensorMap: const Derived& derived() const { return m_impl; } @@ -219,6 +252,14 @@ struct TensorEvaluator<const Derived, Device> internal::unpacket_traits<PacketReturnType>::size); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const { + assert(m_data != NULL); + TensorBlockReader::Run(block, m_data); + } + EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<const Scalar>::Type data() const { return m_data; } /// added for sycl in order to construct the buffer from the sycl device @@ -244,6 +285,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> enum { IsAligned = true, PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -308,7 +350,9 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device> enum { IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & internal::functor_traits<UnaryOp>::PacketAccess, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & + internal::functor_traits<UnaryOp>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -375,16 +419,21 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType; enum { - IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned, - PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess & + IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & + TensorEvaluator<RightArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & + TensorEvaluator<RightArgType, Device>::PacketAccess & internal::functor_traits<BinaryOp>::PacketAccess, - Layout = TensorEvaluator<LeftArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess & + TensorEvaluator<RightArgType, Device>::BlockAccess, + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), + : m_device(device), + m_functor(op.functor()), m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { @@ -399,6 +448,14 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions; + static const int NumDims = internal::array_size< + typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value; + + typedef internal::TensorBlock< + typename internal::remove_const<Scalar>::type, Index, NumDims, + TensorEvaluator<LeftArgType, Device>::Layout> + TensorBlock; + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { // TODO: use right impl instead if right impl dimensions are known at compile time. @@ -433,6 +490,30 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + m_leftImpl.getResourceRequirements(resources); + m_rightImpl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + if (NumDims <= 0) { + output_block->data()[0] = coeff(0); + return; + } + internal::TensorBlockView<LeftArgType, Device> left_block( + m_device, m_leftImpl, *output_block); + internal::TensorBlockView<RightArgType, Device> right_block( + m_device, m_rightImpl, *output_block); + internal::TensorBlockCwiseBinaryIO< + BinaryOp, Index, typename internal::remove_const<Scalar>::type, NumDims, + Layout>::Run(m_functor, output_block->block_sizes(), + output_block->block_strides(), output_block->data(), + left_block.block_strides(), left_block.data(), + right_block.block_strides(), right_block.data()); + } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; } @@ -442,6 +523,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg BinaryOp functor() const { return m_functor; } private: + const Device& m_device; const BinaryOp m_functor; TensorEvaluator<LeftArgType, Device> m_leftImpl; TensorEvaluator<RightArgType, Device> m_rightImpl; @@ -458,6 +540,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned, PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess & internal::functor_traits<TernaryOp>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<Arg1Type, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -562,6 +645,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned, PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess & internal::packet_traits<Scalar>::HasBlend, + BlockAccess = false, Layout = TensorEvaluator<IfArgType, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 53640c6aa..ac5afd891 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -12,31 +12,40 @@ namespace Eigen { -/** \class TensorExecutor - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor executor class. - * - * This class is responsible for launch the evaluation of the expression on - * the specified computing device. - */ +/** + * \class TensorExecutor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor executor class. + * + * This class is responsible for launch the evaluation of the expression on + * the specified computing device. + * + * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and + * instructions) + * @tparam Tileable can use block based tensor evaluation + * (see TensorBlock.h) + */ namespace internal { -// Default strategy: the expression is evaluated with a single cpu thread. -template<typename Expression, typename Device, bool Vectorizable> -class TensorExecutor -{ +/** + * Default strategy: the expression is evaluated sequentially with a single cpu + * thread, without vectorization and block evaluation. + */ +template <typename Expression, typename Device, bool Vectorizable, + bool Tileable> +class TensorExecutor { public: - typedef typename Expression::Index Index; + using StorageIndex = typename Expression::Index; + EIGEN_DEVICE_FUNC - static inline void run(const Expression& expr, const Device& device = Device()) - { + static inline void run(const Expression& expr, + const Device& device = Device()) { TensorEvaluator<Expression, Device> evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const Index size = array_prod(evaluator.dimensions()); - for (Index i = 0; i < size; ++i) { + if (needs_assign) { + const StorageIndex size = array_prod(evaluator.dimensions()); + for (StorageIndex i = 0; i < size; ++i) { evaluator.evalScalar(i); } } @@ -44,35 +53,40 @@ class TensorExecutor } }; - -template<typename Expression> -class TensorExecutor<Expression, DefaultDevice, true> -{ +/** + * Process all the data with a single cpu thread, using vectorized instructions. + */ +template <typename Expression> +class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true, + /*Tileable*/ false> { public: - typedef typename Expression::Index Index; + using StorageIndex = typename Expression::Index; + EIGEN_DEVICE_FUNC - static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) - { + static inline void run(const Expression& expr, + const DefaultDevice& device = DefaultDevice()) { TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const Index size = array_prod(evaluator.dimensions()); - const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size; - // Give the compiler a strong hint to unroll the loop. But don't insist - // on unrolling, because if the function is expensive the compiler should not + if (needs_assign) { + const StorageIndex size = array_prod(evaluator.dimensions()); + const int PacketSize = unpacket_traits<typename TensorEvaluator< + Expression, DefaultDevice>::PacketReturnType>::size; + + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not // unroll the loop at the expense of inlining. - const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; - for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { - for (Index j = 0; j < 4; j++) { + const StorageIndex UnrolledSize = + (size / (4 * PacketSize)) * 4 * PacketSize; + for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) { + for (StorageIndex j = 0; j < 4; j++) { evaluator.evalPacket(i + j * PacketSize); } } - const Index VectorizedSize = (size / PacketSize) * PacketSize; - for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { + const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize; + for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) { evaluator.evalPacket(i); } - for (Index i = VectorizedSize; i < size; ++i) { + for (StorageIndex i = VectorizedSize; i < size; ++i) { evaluator.evalScalar(i); } } @@ -80,41 +94,107 @@ class TensorExecutor<Expression, DefaultDevice, true> } }; +/** + * Process all the data with a single cpu thread, using blocks of data. By + * sizing a block to fit L1 cache we get better cache performance. + */ +template <typename Expression, bool Vectorizable> +class TensorExecutor<Expression, DefaultDevice, Vectorizable, + /*Tileable*/ true> { + public: + using Scalar = typename traits<Expression>::Scalar; + using ScalarNoConst = typename remove_const<Scalar>::type; + + using Evaluator = TensorEvaluator<Expression, DefaultDevice>; + using StorageIndex = typename traits<Expression>::Index; + + static const int NumDims = traits<Expression>::NumDimensions; + + EIGEN_DEVICE_FUNC + static inline void run(const Expression& expr, + const DefaultDevice& device = DefaultDevice()) { + using TensorBlock = + TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>; + using TensorBlockMapper = TensorBlockMapper<ScalarNoConst, StorageIndex, + NumDims, Evaluator::Layout>; + Evaluator evaluator(expr, device); + Index total_size = array_prod(evaluator.dimensions()); + Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + + if (total_size < cache_size) { + // TODO(andydavis) Reduce block management overhead for small tensors. + // TODO(wuke) Do not do this when evaluating TensorBroadcastingOp. + internal::TensorExecutor<Expression, DefaultDevice, Vectorizable, + /*Tileable*/ false>::run(expr, device); + return; + } -// Multicore strategy: the index space is partitioned and each partition is executed on a single core + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + // Size tensor blocks to fit in cache (or requested target block size). + Index block_total_size = numext::mini(cache_size, total_size); + TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims; + // Query expression tree for desired block size/shape. + std::vector<TensorOpResourceRequirements> resources; + evaluator.getResourceRequirements(&resources); + MergeResourceRequirements(resources, &block_shape, &block_total_size); + + TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape, + block_total_size); + block_total_size = block_mapper.block_dims_total_size(); + + Scalar* data = static_cast<Scalar*>( + device.allocate(block_total_size * sizeof(Scalar))); + + const StorageIndex total_block_count = block_mapper.total_block_count(); + for (StorageIndex i = 0; i < total_block_count; ++i) { + TensorBlock block = block_mapper.GetBlockForIndex(i, data); + evaluator.evalBlock(&block); + } + device.deallocate(data); + } + evaluator.cleanup(); + } +}; + +/** + * Multicore strategy: the index space is partitioned and each partition is + * executed on a single core. + */ #ifdef EIGEN_USE_THREADS -template <typename Evaluator, typename Index, bool Vectorizable> +template <typename Evaluator, typename StorageIndex, bool Vectorizable> struct EvalRange { - static void run(Evaluator* evaluator_in, const Index first, const Index last) { + static void run(Evaluator* evaluator_in, const StorageIndex first, + const StorageIndex last) { Evaluator evaluator = *evaluator_in; eigen_assert(last >= first); - for (Index i = first; i < last; ++i) { + for (StorageIndex i = first; i < last; ++i) { evaluator.evalScalar(i); } } - static Index alignBlockSize(Index size) { - return size; - } + static StorageIndex alignBlockSize(StorageIndex size) { return size; } }; -template <typename Evaluator, typename Index> -struct EvalRange<Evaluator, Index, true> { - static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; +template <typename Evaluator, typename StorageIndex> +struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> { + static const int PacketSize = + unpacket_traits<typename Evaluator::PacketReturnType>::size; - static void run(Evaluator* evaluator_in, const Index first, const Index last) { + static void run(Evaluator* evaluator_in, const StorageIndex first, + const StorageIndex last) { Evaluator evaluator = *evaluator_in; eigen_assert(last >= first); - Index i = first; + StorageIndex i = first; if (last - first >= PacketSize) { eigen_assert(first % PacketSize == 0); - Index last_chunk_offset = last - 4 * PacketSize; - // Give the compiler a strong hint to unroll the loop. But don't insist - // on unrolling, because if the function is expensive the compiler should not + StorageIndex last_chunk_offset = last - 4 * PacketSize; + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not // unroll the loop at the expense of inlining. - for (; i <= last_chunk_offset; i += 4*PacketSize) { - for (Index j = 0; j < 4; j++) { + for (; i <= last_chunk_offset; i += 4 * PacketSize) { + for (StorageIndex j = 0; j < 4; j++) { evaluator.evalPacket(i + j * PacketSize); } } @@ -128,7 +208,7 @@ struct EvalRange<Evaluator, Index, true> { } } - static Index alignBlockSize(Index size) { + static StorageIndex alignBlockSize(StorageIndex size) { // Align block size to packet size and account for unrolling in run above. if (size >= 16 * PacketSize) { return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); @@ -138,106 +218,165 @@ struct EvalRange<Evaluator, Index, true> { } }; -template <typename Expression, bool Vectorizable> -class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> { +template <typename Expression, bool Vectorizable, bool Tileable> +class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> { public: - typedef typename Expression::Index Index; - static inline void run(const Expression& expr, const ThreadPoolDevice& device) - { + using StorageIndex = typename Expression::Index; + + static inline void run(const Expression& expr, + const ThreadPoolDevice& device) { typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator; + typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange; + Evaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const Index size = array_prod(evaluator.dimensions()); - size_t num_threads = device.numThreads(); - if (num_threads > 1) { - num_threads = TensorCostModel<ThreadPoolDevice>::numThreads( - size, evaluator.costPerCoeff(Vectorizable), num_threads); - } - if (num_threads == 1) { - EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size); - } else { - const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1; - Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1; - const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize))); - const Index numblocks = size / blocksize; - - Barrier barrier(numblocks); - for (int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier( - &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run, - &evaluator, i * blocksize, (i + 1) * blocksize); - } - if (numblocks * blocksize < size) { - EvalRange<Evaluator, Index, Vectorizable>::run( - &evaluator, numblocks * blocksize, size); - } - barrier.Wait(); - } + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + const StorageIndex PacketSize = + Vectorizable + ? unpacket_traits<typename Evaluator::PacketReturnType>::size + : 1; + const StorageIndex size = array_prod(evaluator.dimensions()); + device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), + EvalRange::alignBlockSize, + [&evaluator](StorageIndex first, StorageIndex last) { + EvalRange::run(&evaluator, first, last); + }); + } + evaluator.cleanup(); + } +}; + +template <typename Expression, bool Vectorizable> +class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> { + public: + using Scalar = typename traits<Expression>::Scalar; + using ScalarNoConst = typename remove_const<Scalar>::type; + + using Evaluator = TensorEvaluator<Expression, ThreadPoolDevice>; + using StorageIndex = typename traits<Expression>::Index; + + static const int NumDims = traits<Expression>::NumDimensions; + + static inline void run(const Expression& expr, + const ThreadPoolDevice& device) { + using TensorBlock = + TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>; + using TensorBlockMapper = + TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>; + + Evaluator evaluator(expr, device); + StorageIndex total_size = array_prod(evaluator.dimensions()); + StorageIndex cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + if (total_size < cache_size) { + // TODO(andydavis) Reduce block management overhead for small tensors. + internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, + false>::run(expr, device); + evaluator.cleanup(); + return; + } + + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims; + Index block_total_size = 0; + // Query expression tree for desired block size/shape. + std::vector<internal::TensorOpResourceRequirements> resources; + evaluator.getResourceRequirements(&resources); + MergeResourceRequirements(resources, &block_shape, &block_total_size); + int num_threads = device.numThreads(); + + // Estimate minimum block size based on cost. + TensorOpCost cost = evaluator.costPerCoeff(Vectorizable); + double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, cost); + size_t block_size = static_cast<size_t>(1.0 / taskSize); + TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape, + block_size); + block_size = block_mapper.block_dims_total_size(); + const size_t aligned_blocksize = + EIGEN_MAX_ALIGN_BYTES * + divup<size_t>(block_size * sizeof(Scalar), EIGEN_MAX_ALIGN_BYTES); + void* buf = device.allocate((num_threads + 1) * aligned_blocksize); + device.parallelFor( + block_mapper.total_block_count(), cost * block_size, + [=, &device, &evaluator, &block_mapper](StorageIndex first, + StorageIndex last) { + // currentThreadId() returns -1 if called from a thread not in the + // thread pool, such as the main thread dispatching Eigen + // expressions. + const int thread_idx = device.currentThreadId(); + eigen_assert(thread_idx >= -1 && thread_idx < num_threads); + Scalar* thread_buf = reinterpret_cast<Scalar*>( + static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1)); + for (StorageIndex i = first; i < last; ++i) { + auto block = block_mapper.GetBlockForIndex(i, thread_buf); + evaluator.evalBlock(&block); + } + }); + device.deallocate(buf); } evaluator.cleanup(); } }; + #endif // EIGEN_USE_THREADS // GPU: the evaluation of the expression is offloaded to a GPU. #if defined(EIGEN_USE_GPU) -template <typename Expression, bool Vectorizable> -class TensorExecutor<Expression, GpuDevice, Vectorizable> { +template <typename Expression, bool Vectorizable, bool Tileable> +class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> { public: - typedef typename Expression::Index Index; + typedef typename Expression::Index StorageIndex; static void run(const Expression& expr, const GpuDevice& device); }; #if defined(EIGEN_GPUCC) -template <typename Evaluator, typename Index, bool Vectorizable> +template <typename Evaluator, typename StorageIndex, bool Vectorizable> struct EigenMetaKernelEval { static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, Index first, Index last, Index step_size) { - for (Index i = first; i < last; i += step_size) { + void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) { + for (StorageIndex i = first; i < last; i += step_size) { eval.evalScalar(i); } } }; -template <typename Evaluator, typename Index> -struct EigenMetaKernelEval<Evaluator, Index, true> { +template <typename Evaluator, typename StorageIndex> +struct EigenMetaKernelEval<Evaluator, StorageIndex, true> { static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, Index first, Index last, Index step_size) { - const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; - const Index vectorized_size = (last / PacketSize) * PacketSize; - const Index vectorized_step_size = step_size * PacketSize; + void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) { + const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; + const StorageIndex vectorized_size = (last / PacketSize) * PacketSize; + const StorageIndex vectorized_step_size = step_size * PacketSize; // Use the vector path - for (Index i = first * PacketSize; i < vectorized_size; + for (StorageIndex i = first * PacketSize; i < vectorized_size; i += vectorized_step_size) { eval.evalPacket(i); } - for (Index i = vectorized_size + first; i < last; i += step_size) { + for (StorageIndex i = vectorized_size + first; i < last; i += step_size) { eval.evalScalar(i); } } }; -template <typename Evaluator, typename Index> +template <typename Evaluator, typename StorageIndex> __global__ void __launch_bounds__(1024) -EigenMetaKernel(Evaluator eval, Index size) { +EigenMetaKernel(Evaluator eval, StorageIndex size) { - const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; - const Index step_size = blockDim.x * gridDim.x; + const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x; + const StorageIndex step_size = blockDim.x * gridDim.x; const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; - EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size); + EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size); } /*static*/ -template <typename Expression, bool Vectorizable> -inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run( +template <typename Expression, bool Vectorizable, bool Tileable> +inline void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run( const Expression& expr, const GpuDevice& device) { TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); @@ -246,12 +385,12 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run( const int block_size = device.maxGpuThreadsPerBlock(); const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size; - const Index size = array_prod(evaluator.dimensions()); + const StorageIndex size = array_prod(evaluator.dimensions()); // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1); LAUNCH_GPU_KERNEL( - (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>), + (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>), num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index e943757ad..1342e47a6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -40,6 +40,8 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, enum { IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0), + PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, Layout = Options_ & RowMajor ? RowMajor : ColMajor, CoordAccess = true, RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index b8f0bc798..fdb31928f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -98,6 +98,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> enum { IsAligned = true, PacketAccess = (PacketSize > 1), + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, RawAccess = true }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 19e456e19..8ed1796df 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -129,8 +129,14 @@ struct IsVectorizable<GpuDevice, Expression> { TensorEvaluator<Expression, GpuDevice>::IsAligned; }; +template <typename Device, typename Expression> +struct IsTileable { + static const bool value = TensorEvaluator<Expression, Device>::BlockAccess; +}; + template <typename Expression, typename Device, - bool Vectorizable = IsVectorizable<Device, Expression>::value> + bool Vectorizable = IsVectorizable<Device, Expression>::value, + bool Tileable = IsTileable<Device, Expression>::value> class TensorExecutor; } // end namespace internal diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index f0f7c7826..72cb2d15f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -186,6 +186,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index 4e384f9b9..e3165fa10 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -119,6 +119,7 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> enum { IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator<ArgType, Device>::RawAccess @@ -181,6 +182,7 @@ template<typename ArgType, typename Device> enum { IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false // to be implemented }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index cda49f8fe..498488649 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -105,6 +105,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> enum { IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator<ArgType, Device>::RawAccess @@ -170,6 +171,7 @@ template<typename NewDimensions, typename ArgType, typename Device> enum { IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator<ArgType, Device>::RawAccess @@ -325,6 +327,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi // slice offsets and sizes. IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, RawAccess = false @@ -557,6 +560,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> enum { IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, RawAccess = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess @@ -716,7 +720,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, static const int NumDims = internal::array_size<Strides>::value; typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; - typedef typename internal::remove_const<Scalar>::type ScalarNonConst; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; typedef Strides Dimensions; @@ -858,7 +861,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, } return inputIndex; } - + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) { #ifndef __SYCL_DEVICE_ONLY__ return numext::maxi(min, numext::mini(max,value)); @@ -907,7 +910,6 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; - typedef typename internal::remove_const<Scalar>::type ScalarNonConst; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; typedef Strides Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 5956e513d..ffa22f31e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -96,6 +96,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device enum { IsAligned = true, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = true, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 9e0a20abf..950ac32af 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -94,6 +94,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index ce573d730..375fc0802 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -412,6 +412,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, enum { IsAligned = false, PacketAccess = Self::InputPacketAccess && Op::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index b2b4fd8d3..a6cade50f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -136,6 +136,7 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef enum { IsAligned = false, PacketAccess = false, + BlockAccess = false, Layout = PlainObjectType::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -364,6 +365,7 @@ struct TensorEvaluator<const TensorRef<Derived>, Device> enum { IsAligned = false, PacketAccess = false, + BlockAccess = false, Layout = TensorRef<Derived>::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -411,6 +413,7 @@ struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<cons enum { IsAligned = false, PacketAccess = false, + BlockAccess = false, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 14a50a029..bb2768ab1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -113,6 +113,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device enum { IsAligned = false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -253,6 +254,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 0697fd1ce..6b54f40ad 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -112,6 +112,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> enum { IsAligned = false, PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -240,6 +241,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device> enum { IsAligned = false, PacketAccess = (internal::packet_traits<Scalar>::size > 1), + BlockAccess = false, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index a7eea99b6..c09513c10 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -112,6 +112,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> enum { IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -273,6 +274,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> enum { IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h index 2b1968de1..c8b2fad1e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -95,6 +95,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, RawAccess = false @@ -110,7 +111,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device> for (int i = 0; i < NumInputDims; ++i) { m_reduced[i] = false; } - + const Dims& op_dims = op.dims(); for (int i = 0; i < NumReducedDims; ++i) { eigen_assert(op_dims[i] >= 0); @@ -128,7 +129,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device> eigen_assert(num_distinct_reduce_dims == NumReducedDims); - // Compute the dimensions of the result. + // Compute the dimensions of the result. const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); int output_index = 0; @@ -229,7 +230,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device> result += m_impl.coeff(cur_index); cur_index += index_stride; } - + return result; } diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index fa19b2159..239a80926 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -213,6 +213,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_striding) ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}") ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}") + ei_add_test(cxx11_tensor_executor "-pthread" "${CMAKE_THREAD_LIBS_INIT}") ei_add_test(cxx11_tensor_ref) ei_add_test(cxx11_tensor_random) ei_add_test(cxx11_tensor_generator) diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp index 66e61aef1..6feeff231 100644 --- a/unsupported/test/cxx11_tensor_block_access.cpp +++ b/unsupported/test/cxx11_tensor_block_access.cpp @@ -19,11 +19,75 @@ using Eigen::Index; using Eigen::RowMajor; using Eigen::ColMajor; +using internal::TensorBlockShapeType; + template<typename T> static const T& choose(int layout, const T& col, const T& row) { return layout == ColMajor ? col : row; } +static const TensorBlockShapeType RandomShape() { + return internal::random<bool>() + ? internal::TensorBlockShapeType::kUniformAllDims + : internal::TensorBlockShapeType::kSkewedInnerDims; +} + +template <int NumDims> +static std::size_t RandomTargetSize(const DSizes<Index, NumDims>& dims) { + return internal::random<int>(1, dims.TotalSize()); +} + +template <int NumDims> +static DSizes<Index, NumDims> RandomDims() { + array<Index, NumDims> dims; + for (int i = 0; i < NumDims; ++i) { + dims[i] = internal::random<int>(1, 20); + } + return DSizes<Index, NumDims>(dims); +}; + +/** Dummy data type to test TensorBlock copy ops. */ +struct Data { + Data() : Data(0) {} + explicit Data(int v) { value = v; } + int value; +}; + +bool operator==(const Data& lhs, const Data& rhs) { + return lhs.value == rhs.value; +} + +std::ostream& operator<<(std::ostream& os, const Data& d) { + os << "Data: value=" << d.value; + return os; +} + +template <typename T> +static T* GenerateRandomData(const Index& size) { + T* data = new T[size]; + for (int i = 0; i < size; ++i) { + data[i] = internal::random<T>(); + } + return data; +} + +template <> +Data* GenerateRandomData(const Index& size) { + Data* data = new Data[size]; + for (int i = 0; i < size; ++i) { + data[i] = Data(internal::random<int>(1, 100)); + } + return data; +} + +template <int NumDims> +static void Debug(DSizes<Index, NumDims> dims) { + for (int i = 0; i < NumDims; ++i) { + std::cout << dims[i] << "; "; + } + std::cout << std::endl; +} + template <int Layout> static void test_block_mapper_sanity() { @@ -74,10 +138,8 @@ static void test_block_mapper_sanity() // index in the visited set. Verify that every coeff accessed only once. template <typename T, int Layout, int NumDims> static void UpdateCoeffSet( - const internal::TensorBlock<T, Index, 4, Layout>& block, - Index first_coeff_index, - int dim_index, - std::set<Index>* visited_coeffs) { + const internal::TensorBlock<T, Index, NumDims, Layout>& block, + Index first_coeff_index, int dim_index, std::set<Index>* visited_coeffs) { const DSizes<Index, NumDims> block_sizes = block.block_sizes(); const DSizes<Index, NumDims> tensor_strides = block.tensor_strides(); @@ -94,89 +156,840 @@ static void UpdateCoeffSet( } } -template <int Layout> -static void test_block_mapper_maps_every_element() -{ - using T = int; - using TensorBlock = internal::TensorBlock<T, Index, 4, Layout>; - using TensorBlockMapper = internal::TensorBlockMapper<T, Index, 4, Layout>; - - DSizes<Index, 4> dims(5, 7, 11, 17); +template <typename T, int NumDims, int Layout> +static void test_block_mapper_maps_every_element() { + using TensorBlock = internal::TensorBlock<T, Index, NumDims, Layout>; + using TensorBlockMapper = + internal::TensorBlockMapper<T, Index, NumDims, Layout>; - auto total_coeffs = static_cast<int>(dims.TotalSize()); + DSizes<Index, NumDims> dims = RandomDims<NumDims>(); // Keep track of elements indices available via block access. std::set<Index> coeff_set; // Try different combinations of block types and sizes. - auto block_shape_type = - internal::random<bool>() - ? internal::TensorBlockShapeType::kUniformAllDims - : internal::TensorBlockShapeType::kSkewedInnerDims; - auto block_target_size = internal::random<int>(1, total_coeffs); - TensorBlockMapper block_mapper(dims, block_shape_type, block_target_size); + TensorBlockMapper block_mapper(dims, RandomShape(), RandomTargetSize(dims)); for (int i = 0; i < block_mapper.total_block_count(); ++i) { TensorBlock block = block_mapper.GetBlockForIndex(i, nullptr); - UpdateCoeffSet<T, Layout, 4>(block, block.first_coeff_index(), - choose(Layout, 3, 0), &coeff_set); + UpdateCoeffSet<T, Layout, NumDims>(block, block.first_coeff_index(), + choose(Layout, NumDims - 1, 0), + &coeff_set); } // Verify that every coefficient in the original Tensor is accessible through // TensorBlock only once. + Index total_coeffs = dims.TotalSize(); VERIFY_IS_EQUAL(coeff_set.size(), total_coeffs); - VERIFY_IS_EQUAL(*coeff_set.begin(), static_cast<Index>(0)); - VERIFY_IS_EQUAL(*coeff_set.rbegin(), static_cast<Index>(total_coeffs - 1)); + VERIFY_IS_EQUAL(*coeff_set.begin(), 0); + VERIFY_IS_EQUAL(*coeff_set.rbegin(), total_coeffs - 1); } -template <int Layout> -static void test_slice_block_mapper_maps_every_element() -{ - using T = int; - using TensorBlock = internal::TensorBlock<T, Index, 4, Layout>; +template <typename T, int NumDims, int Layout> +static void test_slice_block_mapper_maps_every_element() { + using TensorBlock = internal::TensorBlock<T, Index, NumDims, Layout>; using TensorSliceBlockMapper = - internal::TensorSliceBlockMapper<T, Index, 4, Layout>; - - DSizes<Index, 4> tensor_dims(5,7,11,17); - DSizes<Index, 4> tensor_slice_offsets(1,3,5,7); - DSizes<Index, 4> tensor_slice_extents(3,2,4,5); + internal::TensorSliceBlockMapper<T, Index, NumDims, Layout>; + + DSizes<Index, NumDims> tensor_dims = RandomDims<NumDims>(); + DSizes<Index, NumDims> tensor_slice_offsets = RandomDims<NumDims>(); + DSizes<Index, NumDims> tensor_slice_extents = RandomDims<NumDims>(); + + // Make sure that tensor offsets + extents do not overflow. + for (int i = 0; i < NumDims; ++i) { + tensor_slice_offsets[i] = + numext::mini(tensor_dims[i] - 1, tensor_slice_offsets[i]); + tensor_slice_extents[i] = numext::mini( + tensor_slice_extents[i], tensor_dims[i] - tensor_slice_offsets[i]); + } // Keep track of elements indices available via block access. std::set<Index> coeff_set; auto total_coeffs = static_cast<int>(tensor_slice_extents.TotalSize()); - // Try different combinations of block types and sizes. - auto block_shape_type = - internal::random<bool>() - ? internal::TensorBlockShapeType::kUniformAllDims - : internal::TensorBlockShapeType::kSkewedInnerDims; - auto block_target_size = internal::random<int>(1, total_coeffs); - // Pick a random dimension sizes for the tensor blocks. - DSizes<Index, 4> block_sizes; - for (int i = 0; i < 4; ++i) { + DSizes<Index, NumDims> block_sizes; + for (int i = 0; i < NumDims; ++i) { block_sizes[i] = internal::random<int>(1, tensor_slice_extents[i]); } TensorSliceBlockMapper block_mapper(tensor_dims, tensor_slice_offsets, tensor_slice_extents, block_sizes, - DimensionList<Index, 4>()); + DimensionList<Index, NumDims>()); for (int i = 0; i < block_mapper.total_block_count(); ++i) { - TensorBlock block = block_mapper.GetBlockForIndex(i, NULL); - UpdateCoeffSet<T, Layout, 4>(block, block.first_coeff_index(), - choose(Layout, 3, 0), &coeff_set); + TensorBlock block = block_mapper.GetBlockForIndex(i, nullptr); + UpdateCoeffSet<T, Layout, NumDims>(block, block.first_coeff_index(), + choose(Layout, NumDims - 1, 0), + &coeff_set); } VERIFY_IS_EQUAL(coeff_set.size(), total_coeffs); } -EIGEN_DECLARE_TEST(cxx11_tensor_assign) { - CALL_SUBTEST(test_block_mapper_sanity<ColMajor>()); - CALL_SUBTEST(test_block_mapper_sanity<RowMajor>()); - CALL_SUBTEST(test_block_mapper_maps_every_element<ColMajor>()); - CALL_SUBTEST(test_block_mapper_maps_every_element<RowMajor>()); - CALL_SUBTEST(test_slice_block_mapper_maps_every_element<ColMajor>()); - CALL_SUBTEST(test_slice_block_mapper_maps_every_element<RowMajor>()); +template <typename T, int NumDims, int Layout> +static void test_block_io_copy_data_from_source_to_target() { + typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock; + typedef internal::TensorBlockMapper<T, Index, NumDims, Layout> + TensorBlockMapper; + + typedef internal::TensorBlockReader<T, Index, NumDims, Layout> + TensorBlockReader; + typedef internal::TensorBlockWriter<T, Index, NumDims, Layout> + TensorBlockWriter; + + DSizes<Index, NumDims> input_tensor_dims = RandomDims<NumDims>(); + const auto input_tensor_size = input_tensor_dims.TotalSize(); + + T* input_data = GenerateRandomData<T>(input_tensor_size); + T* output_data = new T[input_tensor_size]; + + TensorBlockMapper block_mapper(input_tensor_dims, RandomShape(), + RandomTargetSize(input_tensor_dims)); + T* block_data = new T[block_mapper.block_dims_total_size()]; + + for (int i = 0; i < block_mapper.total_block_count(); ++i) { + TensorBlock block = block_mapper.GetBlockForIndex(i, block_data); + TensorBlockReader::Run(&block, input_data); + TensorBlockWriter::Run(block, output_data); + } + + for (int i = 0; i < input_tensor_size; ++i) { + VERIFY_IS_EQUAL(input_data[i], output_data[i]); + } + + delete[] input_data; + delete[] output_data; + delete[] block_data; +} + +template <int Layout, int NumDims> +static int GetInputIndex(Index output_index, + const array<Index, NumDims>& output_to_input_dim_map, + const array<Index, NumDims>& input_strides, + const array<Index, NumDims>& output_strides) { + int input_index = 0; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + const int idx = output_index / output_strides[i]; + input_index += idx * input_strides[output_to_input_dim_map[i]]; + output_index -= idx * output_strides[i]; + } + return input_index + + output_index * input_strides[output_to_input_dim_map[0]]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const int idx = output_index / output_strides[i]; + input_index += idx * input_strides[output_to_input_dim_map[i]]; + output_index -= idx * output_strides[i]; + } + return input_index + + output_index * input_strides[output_to_input_dim_map[NumDims - 1]]; + } +} + +template <int Layout, int NumDims> +static array<Index, NumDims> ComputeStrides( + const array<Index, NumDims>& sizes) { + array<Index, NumDims> strides; + if (Layout == ColMajor) { + strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + strides[i] = strides[i - 1] * sizes[i - 1]; + } + } else { + strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + } + return strides; +} + +template <typename T, int NumDims, int Layout> +static void test_block_io_copy_using_reordered_dimensions() { + typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock; + typedef internal::TensorBlockMapper<T, Index, NumDims, Layout> + TensorBlockMapper; + + typedef internal::TensorBlockReader<T, Index, NumDims, Layout> + TensorBlockReader; + typedef internal::TensorBlockWriter<T, Index, NumDims, Layout> + TensorBlockWriter; + + DSizes<Index, NumDims> input_tensor_dims = RandomDims<NumDims>(); + const auto input_tensor_size = input_tensor_dims.TotalSize(); + + // Create a random input tensor. + T* input_data = GenerateRandomData<T>(input_tensor_size); + + // Create a random dimension re-ordering/shuffle. + std::vector<Index> shuffle; + for (int i = 0; i < NumDims; ++i) shuffle.push_back(i); + std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937()); + + DSizes<Index, NumDims> output_tensor_dims; + array<Index, NumDims> input_to_output_dim_map; + array<Index, NumDims> output_to_input_dim_map; + for (Index i = 0; i < NumDims; ++i) { + output_tensor_dims[shuffle[i]] = input_tensor_dims[i]; + input_to_output_dim_map[i] = shuffle[i]; + output_to_input_dim_map[shuffle[i]] = i; + } + + // Random block shape and size. + TensorBlockMapper block_mapper(output_tensor_dims, RandomShape(), + RandomTargetSize(input_tensor_dims)); + + auto* block_data = new T[block_mapper.block_dims_total_size()]; + auto* output_data = new T[input_tensor_size]; + + array<Index, NumDims> input_tensor_strides = + ComputeStrides<Layout, NumDims>(input_tensor_dims); + array<Index, NumDims> output_tensor_strides = + ComputeStrides<Layout, NumDims>(output_tensor_dims); + + for (Index i = 0; i < block_mapper.total_block_count(); ++i) { + TensorBlock block = block_mapper.GetBlockForIndex(i, block_data); + const Index first_coeff_index = GetInputIndex<Layout, NumDims>( + block.first_coeff_index(), output_to_input_dim_map, + input_tensor_strides, output_tensor_strides); + TensorBlockReader::Run(&block, first_coeff_index, input_to_output_dim_map, + input_tensor_strides, input_data); + TensorBlockWriter::Run(block, first_coeff_index, input_to_output_dim_map, + input_tensor_strides, output_data); + } + + for (int i = 0; i < input_tensor_size; ++i) { + VERIFY_IS_EQUAL(input_data[i], output_data[i]); + } + + delete[] input_data; + delete[] block_data; + delete[] output_data; +} + +template <int Layout> +static void test_block_io_zero_stride() +{ + typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock; + typedef internal::TensorBlockReader<float, Index, 5, Layout> + TensorBlockReader; + typedef internal::TensorBlockWriter<float, Index, 5, Layout> + TensorBlockWriter; + + DSizes<Index, 5> rnd_dims = RandomDims<5>(); + + DSizes<Index, 5> input_tensor_dims = rnd_dims; + input_tensor_dims[0] = 1; + input_tensor_dims[2] = 1; + input_tensor_dims[4] = 1; + const auto input_tensor_size = input_tensor_dims.TotalSize(); + auto* input_data = GenerateRandomData<float>(input_tensor_size); + + DSizes<Index, 5> output_tensor_dims = rnd_dims; + + DSizes<Index, 5> input_tensor_strides( + ComputeStrides<Layout, 5>(input_tensor_dims)); + DSizes<Index, 5> output_tensor_strides( + ComputeStrides<Layout, 5>(output_tensor_dims)); + + DSizes<Index, 5> input_tensor_strides_with_zeros(input_tensor_strides); + input_tensor_strides_with_zeros[0] = 0; + input_tensor_strides_with_zeros[2] = 0; + input_tensor_strides_with_zeros[4] = 0; + + // Verify that data was correctly read/written from/into the block. + const auto verify_is_equal = [&](const float* output_data) { + for (int i = 0; i < output_tensor_dims[0]; ++i) { + for (int j = 0; j < output_tensor_dims[1]; ++j) { + for (int k = 0; k < output_tensor_dims[2]; ++k) { + for (int l = 0; l < output_tensor_dims[3]; ++l) { + for (int m = 0; m < output_tensor_dims[4]; ++m) { + const Index output_offset = + i * output_tensor_strides[0] + j * output_tensor_strides[1] + + k * output_tensor_strides[2] + l * output_tensor_strides[3] + + m * output_tensor_strides[4]; + const Index input_offset = + i % input_tensor_dims[0] * input_tensor_strides[0] + + j % input_tensor_dims[1] * input_tensor_strides[1] + + k % input_tensor_dims[2] * input_tensor_strides[2] + + l % input_tensor_dims[3] * input_tensor_strides[3] + + m % input_tensor_dims[4] * input_tensor_strides[4]; + VERIFY_IS_EQUAL(output_data[output_offset], + input_data[input_offset]); + } + } + } + } + } + }; + + { + auto* output_data = new float[output_tensor_dims.TotalSize()]; + TensorBlock read_block(0, output_tensor_dims, output_tensor_strides, + input_tensor_strides_with_zeros, output_data); + TensorBlockReader::Run(&read_block, input_data); + verify_is_equal(output_data); + delete[] output_data; + } + + { + auto* output_data = new float[output_tensor_dims.TotalSize()]; + TensorBlock write_block(0, output_tensor_dims, + input_tensor_strides_with_zeros, + output_tensor_strides, input_data); + TensorBlockWriter::Run(write_block, output_data); + verify_is_equal(output_data); + delete[] output_data; + } + + delete[] input_data; +} + +template <int Layout> +static void test_block_io_squeeze_ones() { + typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock; + typedef internal::TensorBlockReader<float, Index, 5, Layout> + TensorBlockReader; + typedef internal::TensorBlockWriter<float, Index, 5, Layout> + TensorBlockWriter; + + // Total size > 1. + { + DSizes<Index, 5> block_sizes(1, 2, 1, 2, 1); + const auto total_size = block_sizes.TotalSize(); + + // Create a random input tensor. + auto* input_data = GenerateRandomData<float>(total_size); + DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes)); + + { + auto* output_data = new float[block_sizes.TotalSize()]; + TensorBlock read_block(0, block_sizes, strides, strides, output_data); + TensorBlockReader::Run(&read_block, input_data); + for (int i = 0; i < total_size; ++i) { + VERIFY_IS_EQUAL(output_data[i], input_data[i]); + } + delete[] output_data; + } + + { + auto* output_data = new float[block_sizes.TotalSize()]; + TensorBlock write_block(0, block_sizes, strides, strides, input_data); + TensorBlockWriter::Run(write_block, output_data); + for (int i = 0; i < total_size; ++i) { + VERIFY_IS_EQUAL(output_data[i], input_data[i]); + } + delete[] output_data; + } + } + + // Total size == 1. + { + DSizes<Index, 5> block_sizes(1, 1, 1, 1, 1); + const auto total_size = block_sizes.TotalSize(); + + // Create a random input tensor. + auto* input_data = GenerateRandomData<float>(total_size); + DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes)); + + { + auto* output_data = new float[block_sizes.TotalSize()]; + TensorBlock read_block(0, block_sizes, strides, strides, output_data); + TensorBlockReader::Run(&read_block, input_data); + for (int i = 0; i < total_size; ++i) { + VERIFY_IS_EQUAL(output_data[i], input_data[i]); + } + delete[] output_data; + } + + { + auto* output_data = new float[block_sizes.TotalSize()]; + TensorBlock write_block(0, block_sizes, strides, strides, input_data); + TensorBlockWriter::Run(write_block, output_data); + for (int i = 0; i < total_size; ++i) { + VERIFY_IS_EQUAL(output_data[i], input_data[i]); + } + delete[] output_data; + } + } +} + +template <typename T, int NumDims, int Layout> +static void test_block_cwise_binary_io_basic() { + typedef internal::scalar_sum_op<T> BinaryFunctor; + typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, T, NumDims, + Layout> + TensorBlockCwiseBinaryIO; + + DSizes<Index, NumDims> block_sizes = RandomDims<NumDims>(); + DSizes<Index, NumDims> strides(ComputeStrides<Layout, NumDims>(block_sizes)); + + const auto total_size = block_sizes.TotalSize(); + + // Create a random input tensors. + T* left_data = GenerateRandomData<T>(total_size); + T* right_data = GenerateRandomData<T>(total_size); + + T* output_data = new T[total_size]; + BinaryFunctor functor; + TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data, + strides, left_data, strides, right_data); + for (int i = 0; i < total_size; ++i) { + VERIFY_IS_EQUAL(output_data[i], functor(left_data[i], right_data[i])); + } + + delete[] left_data; + delete[] right_data; + delete[] output_data; +} + +template <int Layout> +static void test_block_cwise_binary_io_squeeze_ones() { + typedef internal::scalar_sum_op<float> BinaryFunctor; + typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, float, 5, + Layout> + TensorBlockCwiseBinaryIO; + + DSizes<Index, 5> block_sizes(1, 2, 1, 3, 1); + DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes)); + + const auto total_size = block_sizes.TotalSize(); + + // Create a random input tensors. + auto* left_data = GenerateRandomData<float>(total_size); + auto* right_data = GenerateRandomData<float>(total_size); + + auto* output_data = new float[total_size]; + BinaryFunctor functor; + TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data, + strides, left_data, strides, right_data); + for (int i = 0; i < total_size; ++i) { + VERIFY_IS_EQUAL(output_data[i], functor(left_data[i], right_data[i])); + } + + delete[] left_data; + delete[] right_data; + delete[] output_data; } + +template <int Layout> +static void test_block_cwise_binary_io_zero_strides() { + typedef internal::scalar_sum_op<float> BinaryFunctor; + typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, float, 5, + Layout> + TensorBlockCwiseBinaryIO; + + DSizes<Index, 5> rnd_dims = RandomDims<5>(); + + DSizes<Index, 5> left_sizes = rnd_dims; + left_sizes[0] = 1; + left_sizes[2] = 1; + left_sizes[4] = 1; + + DSizes<Index, 5> left_strides(ComputeStrides<Layout, 5>(left_sizes)); + left_strides[0] = 0; + left_strides[2] = 0; + left_strides[4] = 0; + + DSizes<Index, 5> right_sizes = rnd_dims; + right_sizes[1] = 0; + right_sizes[3] = 0; + + DSizes<Index, 5> right_strides(ComputeStrides<Layout, 5>(right_sizes)); + right_strides[1] = 0; + right_strides[3] = 0; + + // Generate random data. + auto* left_data = GenerateRandomData<float>(left_sizes.TotalSize()); + auto* right_data = GenerateRandomData<float>(right_sizes.TotalSize()); + + DSizes<Index, 5> output_sizes = rnd_dims; + DSizes<Index, 5> output_strides(ComputeStrides<Layout, 5>(output_sizes)); + + const auto output_total_size = output_sizes.TotalSize(); + auto* output_data = new float[output_total_size]; + + BinaryFunctor functor; + TensorBlockCwiseBinaryIO::Run(functor, output_sizes, output_strides, + output_data, left_strides, left_data, + right_strides, right_data); + for (int i = 0; i < rnd_dims[0]; ++i) { + for (int j = 0; j < rnd_dims[1]; ++j) { + for (int k = 0; k < rnd_dims[2]; ++k) { + for (int l = 0; l < rnd_dims[3]; ++l) { + for (int m = 0; m < rnd_dims[4]; ++m) { + Index output_index = i * output_strides[0] + j * output_strides[1] + + k * output_strides[2] + l * output_strides[3] + + m * output_strides[4]; + Index left_index = i * left_strides[0] + j * left_strides[1] + + k * left_strides[2] + l * left_strides[3] + + m * left_strides[4]; + Index right_index = i * right_strides[0] + j * right_strides[1] + + k * right_strides[2] + l * right_strides[3] + + m * right_strides[4]; + VERIFY_IS_EQUAL( + output_data[output_index], + functor(left_data[left_index], right_data[right_index])); + } + } + } + } + } + + delete[] left_data; + delete[] right_data; + delete[] output_data; +} + +template <int Layout> +static void test_uniform_block_shape() +{ + using T = int; + typedef internal::TensorBlock<T, Index, 5, Layout> TensorBlock; + typedef internal::TensorBlockMapper<T, Index, 5, Layout> TensorBlockMapper; + + { + // Test shape 'UniformAllDims' with uniform 'max_coeff count'. + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const size_t max_coeff_count = 5 * 5 * 5 * 5 * 5; + TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims, + max_coeff_count); + TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr); + for (int i = 0; i < 5; ++i) { + VERIFY_IS_EQUAL(5, block.block_sizes()[i]); + } + VERIFY(block.block_sizes().TotalSize() <= max_coeff_count); + } + + // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills + // partially into first inner-most dimension. + if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const size_t max_coeff_count = 7 * 5 * 5 * 5 * 5; + TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims, + max_coeff_count); + TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr); + VERIFY_IS_EQUAL(7, block.block_sizes()[0]); + for (int i = 1; i < 5; ++i) { + VERIFY_IS_EQUAL(5, block.block_sizes()[i]); + } + VERIFY(block.block_sizes().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const size_t max_coeff_count = 5 * 5 * 5 * 5 * 6; + TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims, + max_coeff_count); + TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr); + VERIFY_IS_EQUAL(6, block.block_sizes()[4]); + for (int i = 3; i >= 0; --i) { + VERIFY_IS_EQUAL(5, block.block_sizes()[i]); + } + VERIFY(block.block_sizes().TotalSize() <= max_coeff_count); + } + + // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills + // fully into first inner-most dimension. + if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const size_t max_coeff_count = 11 * 5 * 5 * 5 * 5; + TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims, + max_coeff_count); + TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr); + VERIFY_IS_EQUAL(11, block.block_sizes()[0]); + for (int i = 1; i < 5; ++i) { + VERIFY_IS_EQUAL(5, block.block_sizes()[i]); + } + VERIFY(block.block_sizes().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const size_t max_coeff_count = 5 * 5 * 5 * 5 * 7; + TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims, + max_coeff_count); + TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr); + VERIFY_IS_EQUAL(7, block.block_sizes()[4]); + for (int i = 3; i >= 0; --i) { + VERIFY_IS_EQUAL(5, block.block_sizes()[i]); + } + VERIFY(block.block_sizes().TotalSize() <= max_coeff_count); + } + + // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills + // fully into first few inner-most dimensions. + if (Layout == ColMajor) { + DSizes<Index, 5> dims(7, 5, 6, 17, 7); + const size_t max_coeff_count = 7 * 5 * 6 * 7 * 5; + TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims, + max_coeff_count); + TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr); + VERIFY_IS_EQUAL(7, block.block_sizes()[0]); + VERIFY_IS_EQUAL(5, block.block_sizes()[1]); + VERIFY_IS_EQUAL(6, block.block_sizes()[2]); + VERIFY_IS_EQUAL(7, block.block_sizes()[3]); + VERIFY_IS_EQUAL(5, block.block_sizes()[4]); + VERIFY(block.block_sizes().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(7, 5, 6, 9, 7); + const size_t max_coeff_count = 5 * 5 * 5 * 6 * 7; + TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims, + max_coeff_count); + TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr); + VERIFY_IS_EQUAL(7, block.block_sizes()[4]); + VERIFY_IS_EQUAL(6, block.block_sizes()[3]); + VERIFY_IS_EQUAL(5, block.block_sizes()[2]); + VERIFY_IS_EQUAL(5, block.block_sizes()[1]); + VERIFY_IS_EQUAL(5, block.block_sizes()[0]); + VERIFY(block.block_sizes().TotalSize() <= max_coeff_count); + } + + // Test shape 'UniformAllDims' with full allocation to all dims. + if (Layout == ColMajor) { + DSizes<Index, 5> dims(7, 5, 6, 17, 7); + const size_t max_coeff_count = 7 * 5 * 6 * 17 * 7; + TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims, + max_coeff_count); + TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr); + VERIFY_IS_EQUAL(7, block.block_sizes()[0]); + VERIFY_IS_EQUAL(5, block.block_sizes()[1]); + VERIFY_IS_EQUAL(6, block.block_sizes()[2]); + VERIFY_IS_EQUAL(17, block.block_sizes()[3]); + VERIFY_IS_EQUAL(7, block.block_sizes()[4]); + VERIFY(block.block_sizes().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(7, 5, 6, 9, 7); + const size_t max_coeff_count = 7 * 5 * 6 * 9 * 7; + TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kUniformAllDims, + max_coeff_count); + TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr); + VERIFY_IS_EQUAL(7, block.block_sizes()[4]); + VERIFY_IS_EQUAL(9, block.block_sizes()[3]); + VERIFY_IS_EQUAL(6, block.block_sizes()[2]); + VERIFY_IS_EQUAL(5, block.block_sizes()[1]); + VERIFY_IS_EQUAL(7, block.block_sizes()[0]); + VERIFY(block.block_sizes().TotalSize() <= max_coeff_count); + } +} + +template <int Layout> +static void test_skewed_inner_dim_block_shape() +{ + using T = int; + typedef internal::TensorBlock<T, Index, 5, Layout> TensorBlock; + typedef internal::TensorBlockMapper<T, Index, 5, Layout> TensorBlockMapper; + + // Test shape 'SkewedInnerDims' with partial allocation to inner-most dim. + if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const size_t max_coeff_count = 10 * 1 * 1 * 1 * 1; + TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims, + max_coeff_count); + TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr); + VERIFY_IS_EQUAL(10, block.block_sizes()[0]); + for (int i = 1; i < 5; ++i) { + VERIFY_IS_EQUAL(1, block.block_sizes()[i]); + } + VERIFY(block.block_sizes().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const size_t max_coeff_count = 1 * 1 * 1 * 1 * 6; + TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims, + max_coeff_count); + TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr); + VERIFY_IS_EQUAL(6, block.block_sizes()[4]); + for (int i = 3; i >= 0; --i) { + VERIFY_IS_EQUAL(1, block.block_sizes()[i]); + } + VERIFY(block.block_sizes().TotalSize() <= max_coeff_count); + } + + // Test shape 'SkewedInnerDims' with full allocation to inner-most dim. + if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const size_t max_coeff_count = 11 * 1 * 1 * 1 * 1; + TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims, + max_coeff_count); + TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr); + VERIFY_IS_EQUAL(11, block.block_sizes()[0]); + for (int i = 1; i < 5; ++i) { + VERIFY_IS_EQUAL(1, block.block_sizes()[i]); + } + VERIFY(block.block_sizes().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const size_t max_coeff_count = 1 * 1 * 1 * 1 * 7; + TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims, + max_coeff_count); + TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr); + VERIFY_IS_EQUAL(7, block.block_sizes()[4]); + for (int i = 3; i >= 0; --i) { + VERIFY_IS_EQUAL(1, block.block_sizes()[i]); + } + VERIFY(block.block_sizes().TotalSize() <= max_coeff_count); + } + + // Test shape 'SkewedInnerDims' with full allocation to inner-most dim, + // and partial allocation to second inner-dim. + if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const size_t max_coeff_count = 11 * 3 * 1 * 1 * 1; + TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims, + max_coeff_count); + TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr); + VERIFY_IS_EQUAL(11, block.block_sizes()[0]); + VERIFY_IS_EQUAL(3, block.block_sizes()[1]); + for (int i = 2; i < 5; ++i) { + VERIFY_IS_EQUAL(1, block.block_sizes()[i]); + } + VERIFY(block.block_sizes().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const size_t max_coeff_count = 1 * 1 * 1 * 15 * 7; + TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims, + max_coeff_count); + TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr); + VERIFY_IS_EQUAL(7, block.block_sizes()[4]); + VERIFY_IS_EQUAL(15, block.block_sizes()[3]); + for (int i = 2; i >= 0; --i) { + VERIFY_IS_EQUAL(1, block.block_sizes()[i]); + } + VERIFY(block.block_sizes().TotalSize() <= max_coeff_count); + } + + // Test shape 'SkewedInnerDims' with full allocation to inner-most dim, + // and partial allocation to third inner-dim. + if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const size_t max_coeff_count = 11 * 5 * 5 * 1 * 1; + TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims, + max_coeff_count); + TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr); + VERIFY_IS_EQUAL(11, block.block_sizes()[0]); + VERIFY_IS_EQUAL(5, block.block_sizes()[1]); + VERIFY_IS_EQUAL(5, block.block_sizes()[2]); + for (int i = 3; i < 5; ++i) { + VERIFY_IS_EQUAL(1, block.block_sizes()[i]); + } + VERIFY(block.block_sizes().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const size_t max_coeff_count = 1 * 1 * 5 * 17 * 7; + TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims, + max_coeff_count); + TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr); + VERIFY_IS_EQUAL(7, block.block_sizes()[4]); + VERIFY_IS_EQUAL(17, block.block_sizes()[3]); + VERIFY_IS_EQUAL(5, block.block_sizes()[2]); + for (int i = 1; i >= 0; --i) { + VERIFY_IS_EQUAL(1, block.block_sizes()[i]); + } + VERIFY(block.block_sizes().TotalSize() <= max_coeff_count); + } + + // Test shape 'SkewedInnerDims' with full allocation to all dims. + if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const size_t max_coeff_count = 11 * 5 * 6 * 17 * 7; + TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims, + max_coeff_count); + TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr); + VERIFY_IS_EQUAL(11, block.block_sizes()[0]); + VERIFY_IS_EQUAL(5, block.block_sizes()[1]); + VERIFY_IS_EQUAL(6, block.block_sizes()[2]); + VERIFY_IS_EQUAL(17, block.block_sizes()[3]); + VERIFY_IS_EQUAL(7, block.block_sizes()[4]); + VERIFY(block.block_sizes().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const size_t max_coeff_count = 11 * 5 * 6 * 17 * 7; + TensorBlockMapper block_mapper(dims, TensorBlockShapeType::kSkewedInnerDims, + max_coeff_count); + TensorBlock block = block_mapper.GetBlockForIndex(0, nullptr); + VERIFY_IS_EQUAL(7, block.block_sizes()[4]); + VERIFY_IS_EQUAL(17, block.block_sizes()[3]); + VERIFY_IS_EQUAL(6, block.block_sizes()[2]); + VERIFY_IS_EQUAL(5, block.block_sizes()[1]); + VERIFY_IS_EQUAL(11, block.block_sizes()[0]); + VERIFY(block.block_sizes().TotalSize() <= max_coeff_count); + } +} + +template <int Layout> +static void test_empty_dims(const internal::TensorBlockShapeType block_shape) +{ + using T = int; + + // Test blocking of tensors with zero dimensions: + // - we must not crash on asserts and divisions by zero + // - we must not return block with zero dimensions + // (recipe for overflows/underflows, divisions by zero and NaNs later) + // - total block count must be zero + { + typedef internal::TensorBlockMapper<T, Index, 1, Layout> TensorBlockMapper; + DSizes<Index, 1> dims(0); + for (int max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) { + TensorBlockMapper block_mapper(dims, block_shape, max_coeff_count); + VERIFY_IS_EQUAL(block_mapper.total_block_count(), 0); + VERIFY(block_mapper.block_dims_total_size() >= 1); + } + } + + { + typedef internal::TensorBlockMapper<T, Index, 2, Layout> TensorBlockMapper; + for (int dim1 = 0; dim1 < 3; ++dim1) { + for (int dim2 = 0; dim2 < 3; ++dim2) { + DSizes<Index, 2> dims(dim1, dim2); + for (int max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) { + TensorBlockMapper block_mapper(dims, block_shape, max_coeff_count); + if (dim1 * dim2 == 0) { + VERIFY_IS_EQUAL(block_mapper.total_block_count(), 0); + } + VERIFY(block_mapper.block_dims_total_size() >= 1); + } + } + } + } +} + +#define TEST_LAYOUTS(NAME) \ + CALL_SUBTEST(NAME<ColMajor>()); \ + CALL_SUBTEST(NAME<RowMajor>()) + +#define TEST_LAYOUTS_AND_DIMS(TYPE, NAME) \ + CALL_SUBTEST((NAME<TYPE, 1, ColMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 1, RowMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 2, ColMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 2, RowMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 3, ColMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 3, RowMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 4, ColMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 4, RowMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 5, ColMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 5, RowMajor>())) + +#define TEST_LAYOUTS_WITH_ARG(NAME, ARG) \ + CALL_SUBTEST(NAME<ColMajor>(ARG)); \ + CALL_SUBTEST(NAME<RowMajor>(ARG)) + +EIGEN_DECLARE_TEST(cxx11_tensor_block_access) { + TEST_LAYOUTS(test_block_mapper_sanity); + TEST_LAYOUTS_AND_DIMS(float, test_block_mapper_maps_every_element); + TEST_LAYOUTS_AND_DIMS(float, test_slice_block_mapper_maps_every_element); + TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_data_from_source_to_target); + TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_data_from_source_to_target); + TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_using_reordered_dimensions); + TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_using_reordered_dimensions); + TEST_LAYOUTS(test_block_io_zero_stride); + TEST_LAYOUTS(test_block_io_squeeze_ones); + TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_binary_io_basic); + TEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones); + TEST_LAYOUTS(test_block_cwise_binary_io_zero_strides); + TEST_LAYOUTS(test_uniform_block_shape); + TEST_LAYOUTS(test_skewed_inner_dim_block_shape); + TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kUniformAllDims); + TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kSkewedInnerDims); +} + +#undef TEST_LAYOUTS +#undef TEST_LAYOUTS_WITH_ARG
\ No newline at end of file diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu index aa28457b1..f2a2a6cfa 100644 --- a/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu +++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu @@ -93,7 +93,7 @@ void test_cuda_complex_cwise_ops() { } -void test_cxx11_tensor_complex_cwise_ops() +EIGEN_DECLARE_TEST(test_cxx11_tensor_complex_cwise_ops) { CALL_SUBTEST(test_cuda_complex_cwise_ops<float>()); CALL_SUBTEST(test_cuda_complex_cwise_ops<double>()); diff --git a/unsupported/test/cxx11_tensor_complex_gpu.cu b/unsupported/test/cxx11_tensor_complex_gpu.cu index 7cf06aa7a..f8b8ae704 100644 --- a/unsupported/test/cxx11_tensor_complex_gpu.cu +++ b/unsupported/test/cxx11_tensor_complex_gpu.cu @@ -177,7 +177,7 @@ static void test_cuda_product_reductions() { } -void test_cxx11_tensor_complex() +EIGEN_DECLARE_TEST(test_cxx11_tensor_complex) { CALL_SUBTEST(test_cuda_nullary()); CALL_SUBTEST(test_cuda_sum_reductions()); diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp new file mode 100644 index 000000000..274f901ce --- /dev/null +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -0,0 +1,87 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_USE_THREADS + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::RowMajor; +using Eigen::ColMajor; + +// A set of tests to verify that different TensorExecutor strategies yields the +// same results for all the ops, supporting tiled execution. + +template <typename Device, bool Vectorizable, bool Tileable, int Layout> +static void test_execute_binary_expr(Device d) { + // Pick a large enough tensor size to bypass small tensor block evaluation + // optimization. + int d0 = internal::random<int>(100, 200); + int d1 = internal::random<int>(100, 200); + int d2 = internal::random<int>(100, 200); + + static constexpr int Options = 0; + using IndexType = int; + + Tensor<float, 3, Options, IndexType> lhs(d0, d1, d2); + Tensor<float, 3, Options, IndexType> rhs(d0, d1, d2); + Tensor<float, 3, Options, IndexType> dst(d0, d1, d2); + + lhs.setRandom(); + rhs.setRandom(); + + const auto expr = lhs + rhs; + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using Executor = + internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>; + + Executor::run(Assign(dst, expr), d); + + for (int i = 0; i < d0; ++i) { + for (int j = 0; j < d1; ++j) { + for (int k = 0; k < d2; ++k) { + float sum = lhs(i, j, k) + rhs(i, j, k); + VERIFY_IS_EQUAL(sum, dst(i, j, k)); + } + } + } +} + +#define CALL_SUBTEST_COMBINATIONS(NAME) \ + CALL_SUBTEST((NAME<DefaultDevice, false, false, ColMajor>(default_device))); \ + CALL_SUBTEST((NAME<DefaultDevice, false, true, ColMajor>(default_device))); \ + CALL_SUBTEST((NAME<DefaultDevice, true, false, ColMajor>(default_device))); \ + CALL_SUBTEST((NAME<DefaultDevice, true, true, ColMajor>(default_device))); \ + CALL_SUBTEST((NAME<DefaultDevice, false, false, RowMajor>(default_device))); \ + CALL_SUBTEST((NAME<DefaultDevice, false, true, RowMajor>(default_device))); \ + CALL_SUBTEST((NAME<DefaultDevice, true, false, RowMajor>(default_device))); \ + CALL_SUBTEST((NAME<DefaultDevice, true, true, RowMajor>(default_device))); \ + CALL_SUBTEST((NAME<ThreadPoolDevice, false, false, ColMajor>(tp_device))); \ + CALL_SUBTEST((NAME<ThreadPoolDevice, false, true, ColMajor>(tp_device))); \ + CALL_SUBTEST((NAME<ThreadPoolDevice, true, false, ColMajor>(tp_device))); \ + CALL_SUBTEST((NAME<ThreadPoolDevice, true, true, ColMajor>(tp_device))); \ + CALL_SUBTEST((NAME<ThreadPoolDevice, false, false, RowMajor>(tp_device))); \ + CALL_SUBTEST((NAME<ThreadPoolDevice, false, true, RowMajor>(tp_device))); \ + CALL_SUBTEST((NAME<ThreadPoolDevice, true, false, RowMajor>(tp_device))); \ + CALL_SUBTEST((NAME<ThreadPoolDevice, true, true, RowMajor>(tp_device))) + +EIGEN_DECLARE_TEST(cxx11_tensor_executor) { + Eigen::DefaultDevice default_device; + + const auto num_threads = internal::random<int>(1, 24); + Eigen::ThreadPool tp(num_threads); + Eigen::ThreadPoolDevice tp_device(&tp, num_threads); + + CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr); +} + +#undef CALL_SUBTEST_COMBINATIONS |