// -----------------------------------------------------------------------------
// Reader's guide to the patch text below (commit 83c0a16b, "Add block evaluation
// support to TensorOps", touching unsupported/Eigen/CXX11/src/Tensor/
// TensorChipping.h: 136 insertions, 8 deletions per the diffstat).
//
// NOTE(review): the patch text is stored with its line breaks collapsed, and
// what look like template argument lists are absent (e.g. `static_cast(Layout)`,
// `DSizes&`, `std::vector*`, `struct TensorEvaluator, Device>`). Presumably an
// extraction artifact -- confirm against the upstream commit before applying.
//
// What the visible hunks change:
//  * Hunk -144/+144 (first evaluator): `BlockAccess` goes from `false` to
//    `TensorEvaluator::BlockAccess`; the other enum entries are only
//    re-aligned. Adds the aliases ScalarNoConst, InputTensorBlock and
//    OutputTensorBlock.
//  * Hunk -184/+189 (constructor): when BlockAccess is set, fills
//    m_inputStrides from input_dims. For ColMajor, stride[0] = 1 and
//    stride[i] = stride[i-1] * input_dims[i-1]; otherwise the last stride is 1
//    and stride[i] = stride[i+1] * input_dims[i+1]. Also sets
//    m_block_total_size_max to numext::maxi(1, device.lastLevelCacheSize() /
//    sizeof(Scalar)).
//  * Hunk -266/+288: adds getResourceRequirements(), which pushes a
//    TensorOpResourceRequirements(kSkewedInnerDims, m_block_total_size_max)
//    and then forwards to m_impl.getResourceRequirements(). Adds block(),
//    which derives an input block from the output block:
//      - dims before chip_dim are copied as-is, dims after chip_dim are taken
//        from output index i - 1, and the chip dim itself gets size 1;
//      - the chip-dim stride is 1 when chip_dim is index 0 (ColMajor) or
//        NumInputDims - 1 (otherwise); else it is the neighbouring
//        stride * size (index chip_dim - 1 for ColMajor, chip_dim + 1 otherwise);
//      - the input block starts at srcCoeff(output_block->first_coeff_index()),
//        uses m_inputStrides and output_block->data(), and is passed to
//        m_impl.block().
//  * Hunk -316/+392: adds the members m_block_total_size_max and m_inputStrides.
//  * Hunk -342/+420 (second evaluator, constructed via `Base(op, device)`):
//    same BlockAccess change, adds `Layout` to the enum, and adds the same
//    three aliases.
//  * Hunk -395/+479: adds writeBlock(), which repeats the size/stride
//    computation of block() through `this->` and calls m_impl.writeBlock()
//    with a const_cast of output_block.data().
//
// NOTE(review): block() and writeBlock() carry the same ~30-line size/stride
// computation; a shared helper would remove the duplication.
// -----------------------------------------------------------------------------
From 83c0a16baf5ecac6288cd9b74536a82de8985b31 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 31 Jul 2018 15:56:31 -0700 Subject: Add block evaluation support to TensorOps --- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 144 +++++++++++++++++++-- 1 file changed, 136 insertions(+), 8 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 085c05f3d..a0d039e64 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -144,14 +144,19 @@ struct TensorEvaluator, Device> enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets. - IsAligned = false, + IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using InputTensorBlock = internal::TensorBlock; + using OutputTensorBlock = internal::TensorBlock; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset()) { @@ -184,6 +189,23 @@ struct TensorEvaluator, Device> } m_inputStride *= input_dims[m_dim.actualDim()]; m_inputOffset = m_stride * op.offset(); + + if (BlockAccess) { + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStrides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + m_inputStrides[i] = 
m_inputStrides[i + 1] * input_dims[i + 1]; + } + } + + m_block_total_size_max = + numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -266,6 +288,60 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, cost, vectorized, PacketSize); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kSkewedInnerDims, + m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + // TODO(andydavis) Reduce the overhead of this function (experiment with + // using a fixed block size). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + OutputTensorBlock* output_block) const { + // Calculate input block sizes. + const DSizes& output_block_sizes = + output_block->block_sizes(); + const DSizes& output_block_strides = + output_block->block_strides(); + const Index chip_dim = m_dim.actualDim(); + DSizes input_block_sizes; + DSizes input_block_strides; + for (Index i = 0; i < NumInputDims; ++i) { + if (i < chip_dim) { + input_block_sizes[i] = output_block_sizes[i]; + input_block_strides[i] = output_block_strides[i]; + } else if (i > chip_dim) { + input_block_sizes[i] = output_block_sizes[i - 1]; + input_block_strides[i] = output_block_strides[i - 1]; + } else { + input_block_sizes[i] = 1; + } + } + // Fix up input_block_stride for chip dimension. 
+ if (static_cast(Layout) == static_cast(ColMajor)) { + if (chip_dim == 0) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = + input_block_strides[chip_dim - 1] * input_block_sizes[chip_dim - 1]; + } + } else { + if (chip_dim == NumInputDims - 1) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = + input_block_strides[chip_dim + 1] * input_block_sizes[chip_dim + 1]; + } + } + // Instantiate and read input block from input tensor. + InputTensorBlock input_block(srcCoeff(output_block->first_coeff_index()), + input_block_sizes, input_block_strides, + m_inputStrides, output_block->data()); + m_impl.block(&input_block); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits::PointerType data() const { CoeffReturnType* result = const_cast(m_impl.data()); if (((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumDims) || @@ -316,6 +392,8 @@ struct TensorEvaluator, Device> Index m_stride; Index m_inputOffset; Index m_inputStride; + Index m_block_total_size_max; + DSizes m_inputStrides; TensorEvaluator m_impl; const internal::DimensionId m_dim; const Device& m_device; @@ -342,12 +420,18 @@ struct TensorEvaluator, Device> static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using InputTensorBlock = internal::TensorBlock; + using OutputTensorBlock = internal::TensorBlock; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -395,6 +479,50 @@ struct TensorEvaluator, Device> } } } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const OutputTensorBlock& output_block) { + // 
Calculate input block sizes. + const DSizes& output_block_sizes = + output_block.block_sizes(); + const DSizes& output_block_strides = + output_block.block_strides(); + const Index chip_dim = this->m_dim.actualDim(); + DSizes input_block_sizes; + DSizes input_block_strides; + for (Index i = 0; i < NumInputDims; ++i) { + if (i < chip_dim) { + input_block_sizes[i] = output_block_sizes[i]; + input_block_strides[i] = output_block_strides[i]; + } else if (i > chip_dim) { + input_block_sizes[i] = output_block_sizes[i - 1]; + input_block_strides[i] = output_block_strides[i - 1]; + } else { + input_block_sizes[i] = 1; + } + } + // Fix up input_block_stride for chip dimension. + if (static_cast(Layout) == static_cast(ColMajor)) { + if (chip_dim == 0) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = + input_block_strides[chip_dim - 1] * input_block_sizes[chip_dim - 1]; + } + } else { + if (chip_dim == NumInputDims - 1) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = + input_block_strides[chip_dim + 1] * input_block_sizes[chip_dim + 1]; + } + } + // Write input block. + this->m_impl.writeBlock(InputTensorBlock( + this->srcCoeff(output_block.first_coeff_index()), input_block_sizes, + input_block_strides, this->m_inputStrides, + const_cast(output_block.data()))); + } }; -- cgit v1.2.3