diff options
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 144 |
1 files changed, 136 insertions, 8 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 3ab0a0f49..7579ab507 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -144,14 +144,19 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets. - IsAligned = false, + IsAligned = false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const<Scalar>::type; + + using InputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>; + using OutputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset()) { @@ -184,6 +189,23 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> } m_inputStride *= input_dims[m_dim.actualDim()]; m_inputOffset = m_stride * op.offset(); + + if (BlockAccess) { + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStrides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + } + } + + m_block_total_size_max = + numext::maxi<Index>(1, device.lastLevelCacheSize() / sizeof(Scalar)); + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -266,6 +288,60 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> TensorOpCost(0, 0, cost, vectorized, PacketSize); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kSkewedInnerDims, + m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + // TODO(andydavis) Reduce the overhead of this function (experiment with + // using a fixed block size). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + OutputTensorBlock* output_block) const { + // Calculate input block sizes. + const DSizes<Index, NumDims>& output_block_sizes = + output_block->block_sizes(); + const DSizes<Index, NumDims>& output_block_strides = + output_block->block_strides(); + const Index chip_dim = m_dim.actualDim(); + DSizes<Index, NumInputDims> input_block_sizes; + DSizes<Index, NumInputDims> input_block_strides; + for (Index i = 0; i < NumInputDims; ++i) { + if (i < chip_dim) { + input_block_sizes[i] = output_block_sizes[i]; + input_block_strides[i] = output_block_strides[i]; + } else if (i > chip_dim) { + input_block_sizes[i] = output_block_sizes[i - 1]; + input_block_strides[i] = output_block_strides[i - 1]; + } else { + input_block_sizes[i] = 1; + } + } + // Fix up input_block_stride for chip dimension. + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + if (chip_dim == 0) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = + input_block_strides[chip_dim - 1] * input_block_sizes[chip_dim - 1]; + } + } else { + if (chip_dim == NumInputDims - 1) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = + input_block_strides[chip_dim + 1] * input_block_sizes[chip_dim + 1]; + } + } + // Instantiate and read input block from input tensor. + InputTensorBlock input_block(srcCoeff(output_block->first_coeff_index()), + input_block_sizes, input_block_strides, + m_inputStrides, output_block->data()); + m_impl.block(&input_block); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const { CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data()); if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumDims) || @@ -316,6 +392,8 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> Index m_stride; Index m_inputOffset; Index m_inputStride; + Index m_block_total_size_max; + DSizes<Index, NumInputDims> m_inputStrides; TensorEvaluator<ArgType, Device> m_impl; const internal::DimensionId<DimId> m_dim; const Device& m_device; @@ -342,12 +420,18 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> static const int PacketSize = PacketType<CoeffReturnType, Device>::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = false, - RawAccess = false + BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const<Scalar>::type; + + using InputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>; + using OutputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -395,6 +479,50 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> } } } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const OutputTensorBlock& output_block) { + // Calculate input block sizes. + const DSizes<Index, NumDims>& output_block_sizes = + output_block.block_sizes(); + const DSizes<Index, NumDims>& output_block_strides = + output_block.block_strides(); + const Index chip_dim = this->m_dim.actualDim(); + DSizes<Index, NumInputDims> input_block_sizes; + DSizes<Index, NumInputDims> input_block_strides; + for (Index i = 0; i < NumInputDims; ++i) { + if (i < chip_dim) { + input_block_sizes[i] = output_block_sizes[i]; + input_block_strides[i] = output_block_strides[i]; + } else if (i > chip_dim) { + input_block_sizes[i] = output_block_sizes[i - 1]; + input_block_strides[i] = output_block_strides[i - 1]; + } else { + input_block_sizes[i] = 1; + } + } + // Fix up input_block_stride for chip dimension. + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + if (chip_dim == 0) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = + input_block_strides[chip_dim - 1] * input_block_sizes[chip_dim - 1]; + } + } else { + if (chip_dim == NumInputDims - 1) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = + input_block_strides[chip_dim + 1] * input_block_sizes[chip_dim + 1]; + } + } + // Write input block. + this->m_impl.writeBlock(InputTensorBlock( + this->srcCoeff(output_block.first_coeff_index()), input_block_sizes, + input_block_strides, this->m_inputStrides, + const_cast<ScalarNoConst*>(output_block.data()))); + } }; |