From 4e2f6de1a8fd9a659dc40ed54fedff9abdef3b1f Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 1 Apr 2019 11:47:31 -0700 Subject: Add support for custom packed Lhs/Rhs blocks in tensor contractions --- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 202 +++++++++++++++++---- 1 file changed, 163 insertions(+), 39 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 6ca881f27..6a213096d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -105,7 +105,9 @@ struct traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; static const int Layout = traits::Layout; typedef typename conditional::val, - typename traits::PointerType, typename traits::PointerType>::type PointerType; + typename traits::PointerType, + typename traits::PointerType>::type + PointerType; enum { Flags = 0 @@ -136,6 +138,80 @@ struct traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; }; +// Helper class to allocate and deallocate temporary memory for packed buffers. +template +struct TensorContractionBlockMemAllocator { + typedef void* BlockMemHandle; + + template + EIGEN_DEVICE_FUNC static BlockMemHandle allocate(Device& d, const Index bm, + const Index bk, + const Index bn, + LhsScalar** lhs_block, + RhsScalar** rhs_block) { + eigen_assert(lhs_block); + eigen_assert(rhs_block); + BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn); + char* block_mem = static_cast(d.allocate(sz.lhs_size + sz.rhs_size)); + eigen_assert(block_mem); + *lhs_block = reinterpret_cast(block_mem); + *rhs_block = reinterpret_cast(block_mem + sz.lhs_size); + return block_mem; + } + + template + EIGEN_DEVICE_FUNC static BlockMemHandle allocateSlices( + Device& d, const Index bm, const Index bk, const Index bn, + const Index num_lhs, const Index num_rhs, const Index num_slices, + std::vector* lhs_blocks, + std::vector* rhs_blocks) { + eigen_assert(num_slices > 0); + eigen_assert(num_lhs >= 0 && num_rhs >= 0) + eigen_assert(num_lhs == 0 || lhs_blocks); + eigen_assert(num_rhs == 0 || rhs_blocks); + BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn); + void* block_mem = d.allocate( + (num_lhs * sz.lhs_size + num_rhs * sz.rhs_size) * num_slices); + eigen_assert(block_mem); + char* mem = static_cast(block_mem); + + for (Index x = 0; x < num_slices; x++) { + if (num_lhs > 0) lhs_blocks[x].resize(num_lhs); + for (Index m = 0; m < num_lhs; m++) { + lhs_blocks[x][m] = reinterpret_cast(mem); + mem += sz.lhs_size; + } + if (num_rhs > 0) rhs_blocks[x].resize(num_rhs); + for (Index n = 0; n < num_rhs; n++) { + rhs_blocks[x][n] = reinterpret_cast(mem); + mem += sz.rhs_size; + } + } + + return block_mem; + } + + template + EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) { + d.deallocate(handle); + } + + private: + struct BlockSizes { + Index lhs_size; + Index rhs_size; + }; + EIGEN_DEVICE_FUNC static BlockSizes ComputeLhsRhsBlockSizes(const Index bm, + const Index bk, + const Index bn) { + Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); + BlockSizes sz; + sz.lhs_size = divup(bm * bk * sizeof(LhsScalar), align) * align; + sz.rhs_size = divup(bn * bk * sizeof(RhsScalar), align) * align; + return sz; + } +}; + // WARNING: In this code we assume that Lhs and Rhs tensor expressions are in // ColMajor storage order. This property is guaranteed by the // TensorContractionOp evaluator. TensorContractionKernel specifies how we pack @@ -164,16 +240,28 @@ struct traits struct TensorContractionKernel { + TensorContractionKernel(StorageIndex m, StorageIndex k, StorageIndex n, + StorageIndex bm, StorageIndex bk, StorageIndex bn) + : m(m), k(k), n(n), bm(bm), bk(bk), bn(bn) {} + + // Pack blocks of Lhs and Rhs into contiguous blocks in memory. + typedef LhsScalar* LhsBlock; + typedef RhsScalar* RhsBlock; + + // Packed Lhs/Rhs block memory allocator. + typedef TensorContractionBlockMemAllocator + BlockMemAllocator; + typedef typename BlockMemAllocator::BlockMemHandle BlockMemHandle; + typedef typename internal::gebp_traits Traits; - typedef internal::gemm_pack_lhs + typedef internal::gemm_pack_lhs< + LhsScalar, StorageIndex, typename LhsMapper::SubMapper, Traits::mr, + Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor> LhsPacker; typedef internal::gemm_pack_rhs GebpKernel; - EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE - static void packLhs(LhsScalar* lhsBlock, - const typename LhsMapper::SubMapper& data_mapper, - const StorageIndex depth, const StorageIndex rows) { - LhsPacker()(lhsBlock, data_mapper, depth, rows, /*stride*/ 0, /*offset*/ 0); + template + EIGEN_DEVICE_FUNC BlockMemHandle allocate(Device& d, LhsBlock* lhs_block, + RhsBlock* rhs_block) { + return BlockMemAllocator::allocate(d, bm, bk, bn, lhs_block, rhs_block); + } + + template + EIGEN_DEVICE_FUNC BlockMemHandle allocateSlices( + Device& d, const StorageIndex num_lhs, const StorageIndex num_rhs, + const StorageIndex num_slices, std::vector* lhs_blocks, + std::vector* rhs_blocks) { + return BlockMemAllocator::allocateSlices( + d, bm, bk, bn, num_lhs, num_rhs, num_slices, lhs_blocks, rhs_blocks); + } + + template + EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) { + BlockMemAllocator::deallocate(d, handle); } - EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE - static void packRhs(RhsScalar* rhsBlock, - const typename RhsMapper::SubMapper& data_mapper, - const StorageIndex depth, const StorageIndex cols) { - RhsPacker()(rhsBlock, data_mapper, depth, cols); + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packLhs( + LhsBlock* lhsBlock, const typename LhsMapper::SubMapper& data_mapper, + const StorageIndex depth, const StorageIndex rows) { + LhsPacker()(*lhsBlock, data_mapper, depth, rows, /*stride*/ 0, + /*offset*/ 0); } - EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE - static void invoke(const OutputMapper& output_mapper, - const LhsScalar* lhsBlock, const RhsScalar* rhsBlock, - const StorageIndex rows, const StorageIndex depth, - const StorageIndex cols, const ResScalar alpha) { + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packRhs( + RhsBlock* rhsBlock, const typename RhsMapper::SubMapper& data_mapper, + const StorageIndex depth, const StorageIndex cols) { + RhsPacker()(*rhsBlock, data_mapper, depth, cols); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void invoke( + const OutputMapper& output_mapper, const LhsBlock& lhsBlock, + const RhsBlock& rhsBlock, const StorageIndex rows, + const StorageIndex depth, const StorageIndex cols, + const ResScalar alpha) { + static const int kComputeStrideFromBlockDimensions = -1; GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha, - /*strideA*/ -1, /*strideB*/ -1, + /*strideA*/ kComputeStrideFromBlockDimensions, + /*strideB*/ kComputeStrideFromBlockDimensions, /*offsetA*/ 0, /*offsetB*/ 0); } + + private: + // These are dimensions of the original Tensors, and selected block sizes. The + // actual block sizes passed to all function above might be smaller because of + // the partial blocks at the end. + const StorageIndex m; + const StorageIndex k; + const StorageIndex n; + const StorageIndex bm; + const StorageIndex bk; + const StorageIndex bn; }; } // end namespace internal @@ -257,7 +377,7 @@ class TensorContractionOp : public TensorBase::Scalar Scalar; typedef typename internal::gebp_traits::ResScalar CoeffReturnType; + typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -340,10 +460,10 @@ struct TensorContractionEvaluatorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) - : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), + : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.lhsExpression(), op.rhsExpression()), device), - m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), - op.rhsExpression(), op.lhsExpression()), device), + m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), + op.rhsExpression(), op.lhsExpression()), device), m_device(device), m_output_kernel(op.outputKernel()), m_result(NULL) { @@ -737,11 +857,18 @@ struct TensorContractionEvaluatorBase const Index kc = blocking.kc(); const Index mc = numext::mini(m, blocking.mc()); const Index nc = numext::mini(n, blocking.nc()); - const Index sizeA = mc * kc; - const Index sizeB = kc * nc; - LhsScalar* blockA = static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar))); - RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar))); + typedef typename TensorContractionKernel::LhsBlock LhsBlock; + typedef typename TensorContractionKernel::RhsBlock RhsBlock; + + LhsBlock blockA; + RhsBlock blockB; + + TensorContractionKernel kernel(m, k_slice, n, mc, kc, nc); + + typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle; + const BlockMemHandle packed_mem = + kernel.allocate(this->m_device, &blockA, &blockB); for(Index i2=0; i2= k_end) { @@ -775,8 +900,7 @@ struct TensorContractionEvaluatorBase } } - this->m_device.deallocate(blockA); - this->m_device.deallocate(blockB); + kernel.deallocate(this->m_device, packed_mem); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { -- cgit v1.2.3