From 6913221c43c6ad41b1fbfc0d263d2764abd11ad2 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 25 Jul 2018 13:51:10 -0700 Subject: Add tiled evaluation support to TensorExecutor --- unsupported/Eigen/CXX11/Tensor | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 44 +++- unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 113 ++++++++- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 2 + .../Eigen/CXX11/src/Tensor/TensorChipping.h | 2 + .../Eigen/CXX11/src/Tensor/TensorConcatenation.h | 2 + .../Eigen/CXX11/src/Tensor/TensorContraction.h | 1 + .../Eigen/CXX11/src/Tensor/TensorConversion.h | 1 + .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 10 +- .../Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 1 + .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 16 ++ unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 1 + .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 101 +++++++- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 262 ++++++++++++++++----- .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 2 + .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 1 + .../CXX11/src/Tensor/TensorForwardDeclarations.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 1 + .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 2 + .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 8 +- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 1 + .../Eigen/CXX11/src/Tensor/TensorReduction.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 3 + unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 2 + .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 2 + .../Eigen/CXX11/src/Tensor/TensorStriding.h | 2 + unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h | 7 +- unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_block_access.cpp | 2 +- .../test/cxx11_tensor_complex_cwise_ops_gpu.cu | 2 +- unsupported/test/cxx11_tensor_complex_gpu.cu | 2 +- unsupported/test/cxx11_tensor_executor.cpp | 81 +++++++ 33 files changed, 596 insertions(+), 91 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_executor.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 397d55f76..47514703a 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -112,13 +112,13 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorGlobalFunctions.h" #include "src/Tensor/TensorBase.h" +#include "src/Tensor/TensorBlock.h" #include "src/Tensor/TensorEvaluator.h" #include "src/Tensor/TensorExpr.h" #include "src/Tensor/TensorReduction.h" #include "src/Tensor/TensorReductionGpu.h" #include "src/Tensor/TensorArgMax.h" -#include "src/Tensor/TensorBlock.h" #include "src/Tensor/TensorConcatenation.h" #include "src/Tensor/TensorContractionMapper.h" #include "src/Tensor/TensorContractionBlocking.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 027305586..199ddb123 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -68,6 +68,8 @@ class TensorAssignOp : public TensorBase typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; + static const int NumDims = Eigen::internal::traits::NumDimensions; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} @@ -95,20 +97,33 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType 
CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; + static const int PacketSize = internal::unpacket_traits::size; + static const int NumDims = XprType::NumDims; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - RawAccess = TensorEvaluator::RawAccess + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & + TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess & + TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = TensorEvaluator::RawAccess }; + typedef typename internal::TensorBlock< + typename internal::remove_const::type, Index, NumDims, Layout> + TensorBlock; + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT( + (static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); } EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -164,6 +179,25 @@ struct TensorEvaluator, Device> TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + m_leftImpl.getResourceRequirements(resources); + m_rightImpl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) { + if (TensorEvaluator::RawAccess && + m_leftImpl.data() != nullptr) { + TensorBlock left_block(block->first_coeff_index(), block->block_sizes(), + block->tensor_strides(), block->tensor_strides(), + m_leftImpl.data() + block->first_coeff_index()); + m_rightImpl.block(&left_block); + } else { + m_rightImpl.block(block); + m_leftImpl.writeBlock(*block); + } + } + /// required by sycl in order to extract the accessor const TensorEvaluator& left_impl() const { return m_leftImpl; } /// required by sycl in order to extract the accessor diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 8ffc9d093..5321acecf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -65,6 +65,40 @@ enum class TensorBlockShapeType { kSkewedInnerDims, }; +struct TensorOpResourceRequirements { + TensorBlockShapeType block_shape; + std::size_t block_total_size; + // TODO(andydavis) Add 'target_num_threads' to support communication of + // thread-resource requirements. This will allow ops deep in the + // expression tree (like reductions) to communicate resources + // requirements based on local state (like the total number of reductions + // to be computed). + TensorOpResourceRequirements(internal::TensorBlockShapeType shape, + const std::size_t size) + : block_shape(shape), block_total_size(size) {} +}; + +// Tries to merge multiple resource requirements. +EIGEN_STRONG_INLINE void MergeResourceRequirements( + const std::vector& resources, + TensorBlockShapeType* block_shape, std::size_t* block_total_size) { + if (resources.empty()) { + return; + } + // TODO(andydavis) Implement different policies (i.e. 
revert to a default + // policy if block shapes/sizes conflict). + *block_shape = resources[0].block_shape; + *block_total_size = resources[0].block_total_size; + for (int i = 1; i < resources.size(); ++i) { + if (resources[i].block_shape == TensorBlockShapeType::kSkewedInnerDims && + *block_shape != TensorBlockShapeType::kSkewedInnerDims) { + *block_shape = TensorBlockShapeType::kSkewedInnerDims; + } + *block_total_size = + numext::maxi(*block_total_size, resources[i].block_total_size); + } +} + /** * \class TensorBlock * \ingroup CXX11_Tensor_Module @@ -74,7 +108,7 @@ enum class TensorBlockShapeType { * This class represents a tensor block specified by the index of the * first block coefficient, and the size of the block in each dimension. */ -template +template class TensorBlock { public: typedef DSizes Dimensions; @@ -614,6 +648,83 @@ struct TensorBlockCwiseBinaryIO { } }; +/** + * \class TensorBlockView + * \ingroup CXX11_Tensor_Module + * + * \brief Read-only view into a block of data. + * + * This class provides read-only access to a block of data in impl. It may need + * to allocate space for holding the intermediate result. + * + */ +template +struct TensorBlockView { + typedef TensorEvaluator Impl; + typedef typename Impl::Index Index; + typedef typename remove_const::type Scalar; + static const int NumDims = array_size::value; + typedef DSizes Dimensions; + + // Constructs a TensorBlockView for `impl`. `block` is only used for for + // specifying the start offset, shape, and strides of the block. + template + TensorBlockView(const Device& device, + const TensorEvaluator& impl, + const OtherTensorBlock& block) + : m_device(device), + m_block_sizes(block.block_sizes()), + m_data(NULL), + m_allocated_data(NULL) { + if (Impl::RawAccess && impl.data() != NULL) { + m_data = impl.data() + block.first_coeff_index(); + m_block_strides = block.tensor_strides(); + } else { + // Actually make a copy. + + // TODO(wuke): This sometimes put a lot pressure on the heap allocator. + // Consider allowing ops to request additional temporary block memory in + // TensorOpResourceRequirements. + m_allocated_data = static_cast( + m_device.allocate(m_block_sizes.TotalSize() * sizeof(Scalar))); + m_data = m_allocated_data; + if (NumDims > 0) { + if (static_cast(Impl::Layout) == static_cast(ColMajor)) { + m_block_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_block_strides[i] = m_block_strides[i - 1] * m_block_sizes[i - 1]; + } + } else { + m_block_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_block_strides[i] = m_block_strides[i + 1] * m_block_sizes[i + 1]; + } + } + } + TensorBlock input_block( + block.first_coeff_index(), m_block_sizes, m_block_strides, + block.tensor_strides(), m_allocated_data); + impl.block(&input_block); + } + } + + ~TensorBlockView() { + if (m_allocated_data != NULL) { + m_device.deallocate(m_allocated_data); + } + } + + const Dimensions& block_sizes() const { return m_block_sizes; } + const Dimensions& block_strides() const { return m_block_strides; } + const Scalar* data() const { return m_data; } + + private: + const Device& m_device; + Dimensions m_block_sizes, m_block_strides; + const Scalar* m_data; // Not owned. + Scalar* m_allocated_data; // Owned. 
+}; + /** * \class TensorBlockMapper * \ingroup CXX11_Tensor_Module diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 278689915..7ff0d323b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -1,4 +1,5 @@ // This file is part of Eigen, a lightweight C++ template library +// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2014 Benoit Steiner @@ -110,6 +111,7 @@ struct TensorEvaluator, Device> enum { IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 21ffa2872..085c05f3d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -146,6 +146,7 @@ struct TensorEvaluator, Device> // slice offsets. IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -343,6 +344,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index a7c1380b8..9f0321880 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -122,6 +122,7 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; @@ -306,6 +307,7 @@ template::PacketAccess & TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 0e69cd40c..12cfa8df3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -240,6 +240,7 @@ struct TensorContractionEvaluatorBase enum { IsAligned = true, PacketAccess = (internal::unpacket_traits::size > 1), + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 182bef918..e0cbbb315 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -195,6 +195,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = true, + BlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 25131600d..1ec5819a7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -307,6 +307,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, 
CoordAccess = false, // to be implemented RawAccess = false @@ -577,11 +578,11 @@ __global__ void EigenConvolutionKernel1D( const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, float* buffer) { #if defined(EIGEN_HIPCC) - HIP_DYNAMIC_SHARED(float, s) + HIP_DYNAMIC_SHARED(float, s) #else extern __shared__ float s[]; #endif - + const int first_x = blockIdx.x * maxX; const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; const int num_x_input = last_x - first_x + GetKernelSize()(kernelSize); @@ -630,7 +631,7 @@ __global__ void EigenConvolutionKernel2D( const int maxX, const int numY, const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) { #if defined(EIGEN_HIPCC) - HIP_DYNAMIC_SHARED(float, s) + HIP_DYNAMIC_SHARED(float, s) #else extern __shared__ float s[]; #endif @@ -702,7 +703,7 @@ __global__ void EigenConvolutionKernel3D( const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t kernelSizeZ, float* buffer) { #if defined(EIGEN_HIPCC) - HIP_DYNAMIC_SHARED(float, s) + HIP_DYNAMIC_SHARED(float, s) #else extern __shared__ float s[]; #endif @@ -778,6 +779,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 65403905a..d301d0c01 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -242,6 +242,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 192d4aa7b..5ca47cca7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -290,6 +290,22 @@ struct DSizes : array { } } +#ifndef EIGEN_EMULATE_CXX11_META_H + template + EIGEN_DEVICE_FUNC DSizes(const Sizes& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } +#else + template + EIGEN_DEVICE_FUNC DSizes(const Sizes& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } +#endif + #if EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... 
otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index d0c027890..af39daa91 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -107,6 +107,7 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index fe62ff1ea..ba02802d2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -41,11 +41,24 @@ struct TensorEvaluator enum { IsAligned = Derived::IsAligned, PacketAccess = (internal::unpacket_traits::size > 1), + BlockAccess = internal::is_arithmetic::type>::value, Layout = Derived::Layout, CoordAccess = NumCoords > 0, RawAccess = true }; + typedef typename internal::TensorBlock< + typename internal::remove_const::type, Index, NumCoords, Layout> + TensorBlock; + typedef typename internal::TensorBlockReader< + typename internal::remove_const::type, Index, NumCoords, Layout, + PacketAccess> + TensorBlockReader; + typedef typename internal::TensorBlockWriter< + typename internal::remove_const::type, Index, NumCoords, Layout, + PacketAccess> + TensorBlockWriter; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) : m_data(const_cast::template MakePointer::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m) { } @@ -113,6 +126,20 @@ struct TensorEvaluator internal::unpacket_traits::size); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const { + assert(m_data != NULL); + TensorBlockReader::Run(block, m_data); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlock& block) { + assert(m_data != NULL); + TensorBlockWriter::Run(block, m_data); + } + EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } /// required by sycl in order to construct sycl buffer from raw pointer @@ -167,11 +194,20 @@ struct TensorEvaluator enum { IsAligned = Derived::IsAligned, PacketAccess = (internal::unpacket_traits::size > 1), + BlockAccess = internal::is_arithmetic::type>::value, Layout = Derived::Layout, CoordAccess = NumCoords > 0, RawAccess = true }; + typedef typename internal::TensorBlock< + typename internal::remove_const::type, Index, NumCoords, Layout> + TensorBlock; + typedef typename internal::TensorBlockReader< + typename internal::remove_const::type, Index, NumCoords, Layout, + PacketAccess> + TensorBlockReader; + // Used for accessor extraction in SYCL Managed TensorMap: const Derived& derived() const { return m_impl; } @@ -219,6 +255,14 @@ struct TensorEvaluator internal::unpacket_traits::size); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const { + assert(m_data != NULL); + TensorBlockReader::Run(block, m_data); + } + EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return 
m_data; } /// added for sycl in order to construct the buffer from the sycl device @@ -244,6 +288,7 @@ struct TensorEvaluator, Device> enum { IsAligned = true, PacketAccess = internal::functor_traits::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -308,7 +353,9 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, + PacketAccess = TensorEvaluator::PacketAccess & + internal::functor_traits::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -375,16 +422,21 @@ struct TensorEvaluator XprType; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & + TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess & + TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), + : m_device(device), + m_functor(op.functor()), m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { @@ -399,6 +451,14 @@ struct TensorEvaluator::size; typedef typename TensorEvaluator::Dimensions Dimensions; + static const int NumDims = internal::array_size< + typename TensorEvaluator::Dimensions>::value; + + typedef internal::TensorBlock< + typename internal::remove_const::type, Index, NumDims, + TensorEvaluator::Layout> + TensorBlock; + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { // TODO: use right impl instead if right impl dimensions are known at compile time. 
@@ -433,6 +493,30 @@ struct TensorEvaluator* resources) const { + m_leftImpl.getResourceRequirements(resources); + m_rightImpl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + if (NumDims <= 0) { + output_block->data()[0] = coeff(0); + return; + } + internal::TensorBlockView left_block( + m_device, m_leftImpl, *output_block); + internal::TensorBlockView right_block( + m_device, m_rightImpl, *output_block); + internal::TensorBlockCwiseBinaryIO< + BinaryOp, Index, typename internal::remove_const::type, NumDims, + Layout>::Run(m_functor, output_block->block_sizes(), + output_block->block_strides(), output_block->data(), + left_block.block_strides(), left_block.data(), + right_block.block_strides(), right_block.data()); + } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor const TensorEvaluator& left_impl() const { return m_leftImpl; } @@ -442,6 +526,7 @@ struct TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; @@ -458,6 +543,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -562,6 +648,7 @@ struct TensorEvaluator IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & internal::packet_traits::HasBlend, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 53640c6aa..024de3696 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -12,29 +12,37 @@ namespace Eigen { -/** \class TensorExecutor - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor executor class. - * - * This class is responsible for launch the evaluation of the expression on - * the specified computing device. - */ +/** + * \class TensorExecutor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor executor class. + * + * This class is responsible for launch the evaluation of the expression on + * the specified computing device. + * + * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and + * instructions) + * @tparam Tileable can use block based tensor evaluation + * (see TensorBlock.h) + */ namespace internal { -// Default strategy: the expression is evaluated with a single cpu thread. -template -class TensorExecutor -{ +/** + * Default strategy: the expression is evaluated sequentially with a single cpu + * thread, without vectorization and block evaluation. 
+ */ +template +class TensorExecutor { public: typedef typename Expression::Index Index; EIGEN_DEVICE_FUNC - static inline void run(const Expression& expr, const Device& device = Device()) - { + static inline void run(const Expression& expr, + const Device& device = Device()) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { + if (needs_assign) { const Index size = array_prod(evaluator.dimensions()); for (Index i = 0; i < size; ++i) { evaluator.evalScalar(i); @@ -44,12 +52,14 @@ class TensorExecutor } }; - -template -class TensorExecutor -{ +/** + * Process all the data with a single cpu thread, using vectorized instructions. + */ +template +class TensorExecutor { public: typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) { @@ -58,9 +68,11 @@ class TensorExecutor if (needs_assign) { const Index size = array_prod(evaluator.dimensions()); - const int PacketSize = unpacket_traits::PacketReturnType>::size; - // Give the compiler a strong hint to unroll the loop. But don't insist - // on unrolling, because if the function is expensive the compiler should not + const int PacketSize = unpacket_traits::PacketReturnType>::size; + + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not // unroll the loop at the expense of inlining. const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { @@ -80,9 +92,75 @@ class TensorExecutor } }; +/** + * Process all the data with a single cpu thread, using blocks of data. By + * sizing a block to fit L1 cache we get better cache performance. + */ +template +class TensorExecutor { + public: + typedef typename Expression::Index Index; + + EIGEN_DEVICE_FUNC + static inline void run(const Expression& expr, + const DefaultDevice& device = DefaultDevice()) { + using Evaluator = TensorEvaluator; + using Index = typename traits::Index; + const int NumDims = traits::NumDimensions; -// Multicore strategy: the index space is partitioned and each partition is executed on a single core + using Scalar = typename traits::Scalar; + using ScalarNoConst = typename remove_const::type; + + using TensorBlock = + TensorBlock; + using TensorBlockMapper = + TensorBlockMapper; + + Evaluator evaluator(expr, device); + std::size_t total_size = array_prod(evaluator.dimensions()); + std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + + if (total_size < cache_size) { + // TODO(andydavis) Reduce block management overhead for small tensors. + // TODO(wuke) Do not do this when evaluating TensorBroadcastingOp. + internal::TensorExecutor::run(expr, device); + return; + } + + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + // Size tensor blocks to fit in cache (or requested target block size). + size_t block_total_size = numext::mini(cache_size, total_size); + TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims; + // Query expression tree for desired block size/shape. 
+ std::vector resources; + evaluator.getResourceRequirements(&resources); + MergeResourceRequirements(resources, &block_shape, &block_total_size); + + TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape, + block_total_size); + block_total_size = block_mapper.block_dims_total_size(); + + Scalar* data = static_cast( + device.allocate(block_total_size * sizeof(Scalar))); + + const Index total_block_count = block_mapper.total_block_count(); + for (Index i = 0; i < total_block_count; ++i) { + TensorBlock block = block_mapper.GetBlockForIndex(i, data); + evaluator.evalBlock(&block); + } + device.deallocate(data); + } + evaluator.cleanup(); + } +}; + +/** + * Multicore strategy: the index space is partitioned and each partition is + * executed on a single core. + */ #ifdef EIGEN_USE_THREADS template struct EvalRange { @@ -100,7 +178,7 @@ struct EvalRange { }; template -struct EvalRange { +struct EvalRange { static const int PacketSize = unpacket_traits::size; static void run(Evaluator* evaluator_in, const Index first, const Index last) { @@ -110,8 +188,8 @@ struct EvalRange { if (last - first >= PacketSize) { eigen_assert(first % PacketSize == 0); Index last_chunk_offset = last - 4 * PacketSize; - // Give the compiler a strong hint to unroll the loop. But don't insist - // on unrolling, because if the function is expensive the compiler should not + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not // unroll the loop at the expense of inlining. for (; i <= last_chunk_offset; i += 4*PacketSize) { for (Index j = 0; j < 4; j++) { @@ -138,55 +216,113 @@ struct EvalRange { } }; -template -class TensorExecutor { +template +class TensorExecutor { public: typedef typename Expression::Index Index; - static inline void run(const Expression& expr, const ThreadPoolDevice& device) - { + + static inline void run(const Expression& expr, + const ThreadPoolDevice& device) { typedef TensorEvaluator Evaluator; + typedef EvalRange EvalRange; + Evaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + const Index PacketSize = + Vectorizable + ? unpacket_traits::size + : 1; const Index size = array_prod(evaluator.dimensions()); - size_t num_threads = device.numThreads(); - if (num_threads > 1) { - num_threads = TensorCostModel::numThreads( - size, evaluator.costPerCoeff(Vectorizable), num_threads); - } - if (num_threads == 1) { - EvalRange::run(&evaluator, 0, size); - } else { - const Index PacketSize = Vectorizable ? 
unpacket_traits::size : 1; - Index blocksz = std::ceil(static_cast(size)/num_threads) + PacketSize - 1; - const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize))); - const Index numblocks = size / blocksize; - - Barrier barrier(numblocks); - for (int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier( - &barrier, &EvalRange::run, - &evaluator, i * blocksize, (i + 1) * blocksize); - } - if (numblocks * blocksize < size) { - EvalRange::run( - &evaluator, numblocks * blocksize, size); - } - barrier.Wait(); - } + device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), + EvalRange::alignBlockSize, + [&evaluator](Index first, Index last) { + EvalRange::run(&evaluator, first, last); + }); + } + evaluator.cleanup(); + } +}; + +template +class TensorExecutor { + public: + typedef typename Expression::Index Index; + + static inline void run(const Expression& expr, + const ThreadPoolDevice& device) { + typedef TensorEvaluator Evaluator; + typedef typename internal::remove_const< + typename traits::Scalar>::type Scalar; + typedef typename traits::Index Index; + + static const int NumDims = traits::NumDimensions; + + typedef TensorBlock TensorBlock; + typedef TensorBlockMapper + TensorBlockMapper; + + Evaluator evaluator(expr, device); + std::size_t total_size = array_prod(evaluator.dimensions()); + std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + if (total_size < cache_size) { + // TODO(andydavis) Reduce block management overhead for small tensors. + internal::TensorExecutor::run(expr, device); + evaluator.cleanup(); + return; + } + + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims; + size_t block_total_size = 0; + // Query expression tree for desired block size/shape. + std::vector resources; + evaluator.getResourceRequirements(&resources); + MergeResourceRequirements(resources, &block_shape, &block_total_size); + int num_threads = device.numThreads(); + + // Estimate minimum block size based on cost. + TensorOpCost cost = evaluator.costPerCoeff(Vectorizable); + double taskSize = TensorCostModel::taskSize(1, cost); + size_t block_size = static_cast(1.0 / taskSize); + TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape, + block_size); + block_size = block_mapper.block_dims_total_size(); + const size_t aligned_blocksize = + EIGEN_MAX_ALIGN_BYTES * + divup(block_size * sizeof(Scalar), EIGEN_MAX_ALIGN_BYTES); + void* buf = device.allocate((num_threads + 1) * aligned_blocksize); + device.parallelFor( + block_mapper.total_block_count(), cost * block_size, + [=, &device, &evaluator, &block_mapper](Index first, Index last) { + // currentThreadId() returns -1 if called from a thread not in the + // threadpool, such as the main thread dispatching Eigen + // expressions. + const int thread_idx = device.currentThreadId(); + eigen_assert(thread_idx >= -1 && thread_idx < num_threads); + Scalar* thread_buf = reinterpret_cast( + static_cast(buf) + aligned_blocksize * (thread_idx + 1)); + for (Index i = first; i < last; ++i) { + auto block = block_mapper.GetBlockForIndex(i, thread_buf); + evaluator.evalBlock(&block); + } + }); + device.deallocate(buf); } evaluator.cleanup(); } }; + #endif // EIGEN_USE_THREADS // GPU: the evaluation of the expression is offloaded to a GPU. 
#if defined(EIGEN_USE_GPU) -template -class TensorExecutor { +template +class TensorExecutor { public: typedef typename Expression::Index Index; static void run(const Expression& expr, const GpuDevice& device); @@ -236,8 +372,8 @@ EigenMetaKernel(Evaluator eval, Index size) { } /*static*/ -template -inline void TensorExecutor::run( +template +inline void TensorExecutor::run( const Expression& expr, const GpuDevice& device) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index e943757ad..1342e47a6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -40,6 +40,8 @@ class TensorFixedSize : public TensorBase0), + PacketAccess = (internal::packet_traits::size > 1), + BlockAccess = false, Layout = Options_ & RowMajor ? RowMajor : ColMajor, CoordAccess = true, RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index b8f0bc798..fdb31928f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -98,6 +98,7 @@ struct TensorEvaluator, Device> enum { IsAligned = true, PacketAccess = (PacketSize > 1), + BlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = true }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 19e456e19..8ed1796df 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -129,8 +129,14 @@ struct IsVectorizable { TensorEvaluator::IsAligned; }; +template +struct IsTileable { + static const bool value = TensorEvaluator::BlockAccess; +}; + template ::value> + bool Vectorizable = IsVectorizable::value, + bool Tileable = IsTileable::value> class TensorExecutor; } // end namespace internal diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index f0f7c7826..72cb2d15f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -186,6 +186,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index 4e384f9b9..e3165fa10 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -119,6 +119,7 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator::RawAccess @@ -181,6 +182,7 @@ template enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? 
RowMajor : ColMajor, CoordAccess = false // to be implemented }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index cda49f8fe..498488649 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -105,6 +105,7 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator::RawAccess @@ -170,6 +171,7 @@ template enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator::RawAccess @@ -325,6 +327,7 @@ struct TensorEvaluator, Devi // slice offsets and sizes. IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false @@ -557,6 +560,7 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess @@ -716,7 +720,6 @@ struct TensorEvaluator::value; typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; - typedef typename internal::remove_const::type ScalarNonConst; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef Strides Dimensions; @@ -858,7 +861,7 @@ struct TensorEvaluator::type ScalarNonConst; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef Strides Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 5956e513d..ffa22f31e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -96,6 +96,7 @@ struct TensorEvaluator, Device enum { IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = true, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 9e0a20abf..950ac32af 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -94,6 +94,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index ce573d730..375fc0802 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -412,6 +412,7 @@ struct TensorEvaluator, enum { IsAligned = false, PacketAccess = Self::InputPacketAccess && Op::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h 
index b2b4fd8d3..a6cade50f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -136,6 +136,7 @@ template class TensorRef : public TensorBase, Device> enum { IsAligned = false, PacketAccess = false, + BlockAccess = false, Layout = TensorRef::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -411,6 +413,7 @@ struct TensorEvaluator, Device> : public TensorEvaluator, Device enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -253,6 +254,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 0697fd1ce..6b54f40ad 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -112,6 +112,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = (internal::packet_traits::size > 1), + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -240,6 +241,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = (internal::packet_traits::size > 1), + BlockAccess = false, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index a7eea99b6..c09513c10 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -112,6 +112,7 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -273,6 +274,7 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h index 2b1968de1..c8b2fad1e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -95,6 +95,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false @@ -110,7 +111,7 @@ struct TensorEvaluator, Device> for (int i = 0; i < NumInputDims; ++i) { m_reduced[i] = false; } - + const Dims& op_dims = op.dims(); for (int i = 0; i < NumReducedDims; ++i) { eigen_assert(op_dims[i] >= 0); @@ -128,7 +129,7 @@ struct TensorEvaluator, Device> eigen_assert(num_distinct_reduce_dims == NumReducedDims); - // Compute the dimensions of the result. + // Compute the dimensions of the result. 
const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); int output_index = 0; @@ -229,7 +230,7 @@ struct TensorEvaluator, Device> result += m_impl.coeff(cur_index); cur_index += index_stride; } - + return result; } diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index fa19b2159..239a80926 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -213,6 +213,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_striding) ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}") ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}") + ei_add_test(cxx11_tensor_executor "-pthread" "${CMAKE_THREAD_LIBS_INIT}") ei_add_test(cxx11_tensor_ref) ei_add_test(cxx11_tensor_random) ei_add_test(cxx11_tensor_generator) diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp index 15f2392a3..416b686e4 100644 --- a/unsupported/test/cxx11_tensor_block_access.cpp +++ b/unsupported/test/cxx11_tensor_block_access.cpp @@ -901,7 +901,7 @@ static void test_empty_dims(const internal::TensorBlockShapeType block_shape) CALL_SUBTEST(NAME(ARG)); \ CALL_SUBTEST(NAME(ARG)) -EIGEN_DECLARE_TEST(cxx11_tensor_assign) { +EIGEN_DECLARE_TEST(cxx11_tensor_block_access) { CALL_SUBTEST_LAYOUTS(test_block_mapper_sanity); CALL_SUBTEST_LAYOUTS(test_block_mapper_maps_every_element); CALL_SUBTEST_LAYOUTS(test_slice_block_mapper_maps_every_element); diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu index aa28457b1..f2a2a6cfa 100644 --- a/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu +++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu @@ -93,7 +93,7 @@ void test_cuda_complex_cwise_ops() { } -void test_cxx11_tensor_complex_cwise_ops() +EIGEN_DECLARE_TEST(test_cxx11_tensor_complex_cwise_ops) { CALL_SUBTEST(test_cuda_complex_cwise_ops()); CALL_SUBTEST(test_cuda_complex_cwise_ops()); diff --git a/unsupported/test/cxx11_tensor_complex_gpu.cu b/unsupported/test/cxx11_tensor_complex_gpu.cu index 7cf06aa7a..f8b8ae704 100644 --- a/unsupported/test/cxx11_tensor_complex_gpu.cu +++ b/unsupported/test/cxx11_tensor_complex_gpu.cu @@ -177,7 +177,7 @@ static void test_cuda_product_reductions() { } -void test_cxx11_tensor_complex() +EIGEN_DECLARE_TEST(test_cxx11_tensor_complex) { CALL_SUBTEST(test_cuda_nullary()); CALL_SUBTEST(test_cuda_sum_reductions()); diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp new file mode 100644 index 000000000..5ae45ac5b --- /dev/null +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -0,0 +1,81 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Eugene Zhulenev +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_USE_THREADS + +#include "main.h" + +#include + +using Eigen::Index; +using Eigen::Tensor; +using Eigen::RowMajor; +using Eigen::ColMajor; + +// A set of tests to verify that different TensorExecutor strategies yields the +// same results for all the ops, supporting tiled execution. + +template +static void test_execute_binary_expr(Device d) { + // Pick a large enough tensor size to bypass small tensor block evaluation + // optimization. 
+ Tensor lhs(840, 390, 37); + Tensor rhs(840, 390, 37); + Tensor dst(840, 390, 37); + + lhs.setRandom(); + rhs.setRandom(); + + const auto expr = lhs + rhs; + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); + + for (int i = 0; i < 840; ++i) { + for (int j = 0; j < 390; ++j) { + for (int k = 0; k < 37; ++k) { + float sum = lhs(i, j, k) + rhs(i, j, k); + VERIFY_IS_EQUAL(sum, dst(i, j, k)); + } + } + } +} + +#define CALL_SUBTEST_COMBINATIONS(NAME) \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))) + +EIGEN_DECLARE_TEST(cxx11_tensor_executor) { + Eigen::DefaultDevice default_device; + + const auto num_threads = internal::random(1, 24); + Eigen::ThreadPool tp(num_threads); + Eigen::ThreadPoolDevice tp_device(&tp, num_threads); + + CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr); +} + +#undef CALL_SUBTEST_COMBINATIONS -- cgit v1.2.3
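
Editor's note (not part of the patch): below is a minimal usage sketch of the tiled execution path this patch introduces. It assumes the patch is applied; the file name, include path, tensor sizes and thread count are illustrative only. Because both operands of the cwise expression report BlockAccess, the IsTileable trait selects the block-based TensorExecutor specialization, which partitions the output into cache-sized TensorBlocks and evaluates them via evalBlock().

    // tiled_executor_example.cpp -- illustrative sketch, not shipped with Eigen.
    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      // Large enough to exceed the small-tensor fallback in the tiled executor.
      Eigen::Tensor<float, 3> lhs(840, 390, 37);
      Eigen::Tensor<float, 3> rhs(840, 390, 37);
      Eigen::Tensor<float, 3> dst(840, 390, 37);
      lhs.setRandom();
      rhs.setRandom();

      Eigen::ThreadPool pool(4);
      Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

      // Both evaluators expose BlockAccess, so the Tileable=true
      // TensorExecutor specialization added by this patch runs: the
      // expression is split into TensorBlocks sized via the block mapper
      // and each block is evaluated with evalBlock() into a per-thread
      // scratch buffer before being written to dst.
      dst.device(device) = lhs + rhs;
      return 0;
    }

The same assignment on Eigen::DefaultDevice goes through the single-threaded Tileable specialization instead, falling back to the plain vectorized executor when the total size is below the first-level-cache heuristic.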