diff options
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 95 |
1 files changed, 76 insertions, 19 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index dc9af3aa8..e89f40213 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -73,14 +73,68 @@ EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides( enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims }; struct TensorBlockResourceRequirements { - TensorBlockShapeType shape_type; - size_t size; + TensorBlockShapeType shape_type; // target block shape + size_t size; // target block size + TensorOpCost cost_per_coeff; // cost of computing a single block element + + template <typename Scalar> + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize( + TensorBlockShapeType shape_type, size_t size_in_bytes, + TensorOpCost cost) { + const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar)); + return {shape_type, size, cost}; + } + + template <typename Scalar> + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize( + TensorBlockShapeType shape_type, size_t size_in_bytes) { + // This default cost per coefficient is valid for most materialized tensor + // block evaluation implementations, because they typically just read + // coefficients from the underlying tensor storage, and write to the tensor + // block buffer (scratch or destination memory, reads and writes have linear + // access pattern). We ignore the fixed cost of block evaluation, because in + // practice it should negligible. + // + // Lazy block evaluation adds the cost of calling a functor for each + // coefficient. + // + // All non-trivial block evaluation implementations must provide their own + // cost approximation (e.g. shuffling inner dimension has a much higher cost + // because it reads memory randomly, although the total number of moved + // bytes is the same). + return withShapeAndSize<Scalar>(shape_type, size_in_bytes, + {/*bytes_loaded=*/sizeof(Scalar), + /*bytes_stored=*/sizeof(Scalar), + /*compute_cycles=*/0}); + } + + template <typename Scalar> + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed( + size_t size_in_bytes) { + return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims, + size_in_bytes); + } + + template <typename Scalar> + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform( + size_t size_in_bytes) { + return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims, + size_in_bytes); + } EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockResourceRequirements - merge(const TensorBlockResourceRequirements &lhs, - const TensorBlockResourceRequirements &rhs) { - return {merge(lhs.shape_type, rhs.shape_type), merge(rhs.size, lhs.size)}; + merge(const TensorBlockResourceRequirements& lhs, + const TensorBlockResourceRequirements& rhs) { + return {merge(lhs.shape_type, rhs.shape_type), // shape_type + merge(lhs.size, rhs.size), // size + merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff + } + + EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff( + TensorOpCost cost) { + cost_per_coeff += cost; + return *this; } // This is a resource requirement that should be returned from expressions @@ -88,10 +142,10 @@ struct TensorBlockResourceRequirements { // expression with raw buffer access). EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() { - return {TensorBlockShapeType::kUniformAllDims, 1}; + return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}}; } -private: + private: using Requirements = TensorBlockResourceRequirements; EIGEN_DEVICE_FUNC @@ -100,13 +154,19 @@ private: } EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE TensorBlockShapeType merge(TensorBlockShapeType lhs, - TensorBlockShapeType rhs) { + static EIGEN_STRONG_INLINE TensorBlockShapeType + merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) { return (lhs == TensorBlockShapeType::kSkewedInnerDims || rhs == TensorBlockShapeType::kSkewedInnerDims) ? TensorBlockShapeType::kSkewedInnerDims : TensorBlockShapeType::kUniformAllDims; } + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost, + TensorOpCost rhs_cost) { + return lhs_cost + rhs_cost; + } }; // -------------------------------------------------------------------------- // @@ -131,8 +191,9 @@ class TensorBlockDescriptor { class DestinationBuffer { public: enum DestinationBufferKind : int { - // The above explicit specification of "int" as the enum basetype is needed - // to get around a HIPCC link error ("the field type is not amp-compatible") + // The above explicit specification of "int" as the enum basetype is + // needed to get around a HIPCC link error ("the field type is not + // amp-compatible") // which is issued for class members with the enum type. // TODO(rocm): // remove the "int" basetype once HIPCC has been fixed to not error out @@ -280,7 +341,7 @@ class TensorBlockMapper { TensorBlockMapper() = default; TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions, - const TensorBlockResourceRequirements& requirements) + const TensorBlockResourceRequirements& requirements) : m_tensor_dimensions(dimensions), m_requirements(requirements) { // Compute block dimensions and the total number of blocks. InitializeBlockDimensions(); @@ -299,8 +360,8 @@ class TensorBlockMapper { return m_block_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - BlockDescriptor blockDescriptor(IndexType block_index) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor + blockDescriptor(IndexType block_index) const { static const bool isColMajor = Layout == static_cast<int>(ColMajor); IndexType offset = 0; @@ -416,7 +477,7 @@ class TensorBlockMapper { eigen_assert(m_block_dimensions.TotalSize() >= numext::mini<IndexType>(target_block_size, - m_tensor_dimensions.TotalSize())); + m_tensor_dimensions.TotalSize())); // Calculate block counts by dimension and total block count. DSizes<IndexType, NumDims> block_count; @@ -761,7 +822,6 @@ class TensorMaterializedBlock { template <typename UnaryOp, typename ArgTensorBlock> class TensorCwiseUnaryBlock { - static const bool NoArgBlockAccess = internal::is_void<typename ArgTensorBlock::XprType>::value; @@ -793,7 +853,6 @@ class TensorCwiseUnaryBlock { template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock> class TensorCwiseBinaryBlock { - static const bool NoArgBlockAccess = internal::is_void<typename LhsTensorBlock::XprType>::value || internal::is_void<typename RhsTensorBlock::XprType>::value; @@ -840,7 +899,6 @@ class TensorCwiseBinaryBlock { template <typename BlockFactory, typename ArgTensorBlock> class TensorUnaryExprBlock { - typedef typename ArgTensorBlock::XprType ArgXprType; static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value; @@ -872,7 +930,6 @@ class TensorUnaryExprBlock { template <typename BlockFactory, typename Arg1TensorBlock, typename Arg2TensorBlock, typename Arg3TensorBlock> class TensorTernaryExprBlock { - typedef typename Arg1TensorBlock::XprType Arg1XprType; typedef typename Arg2TensorBlock::XprType Arg2XprType; typedef typename Arg3TensorBlock::XprType Arg3XprType; |