author    Eugene Zhulenev <ezhulenev@google.com>  2018-07-31 15:56:31 -0700
committer Eugene Zhulenev <ezhulenev@google.com>  2018-07-31 15:56:31 -0700
commit    83c0a16baf5ecac6288cd9b74536a82de8985b31 (patch)
tree      fc284a18d1a457eecfb2ce4aeda26bd0c78f2056 /unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
parent    d6568425f8b384ef36da1f16be76a3977fb90846 (diff)
Add block evaluation support to TensorOps
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 144
1 file changed, 136 insertions(+), 8 deletions(-)
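TensorChippingOp is the expression node behind Tensor::chip(offset, dim), which fixes one index and yields a tensor of rank N-1. Before this patch its evaluators advertised BlockAccess = false, forcing coefficient-at-a-time evaluation; afterwards chipping participates in block evaluation, where expressions are materialized in cache-sized blocks. A minimal sketch of the public API this evaluator serves (standard Eigen Tensor usage; block evaluation is an internal detail the executor opts into):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 3> input(2, 3, 5);
  input.setRandom();

  // chip(1, 0) fixes index 1 along dimension 0; the result is a 3x5
  // rank-2 tensor holding one plane of the rank-3 input.
  Eigen::Tensor<float, 2> slice = input.chip(1, 0);
  return slice.size() == 15 ? 0 : 1;
}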
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 085c05f3d..a0d039e64 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -144,14 +144,19 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
enum {
// Alignment can't be guaranteed at compile time since it depends on the
// slice offsets.
- IsAligned = false,
+ IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
+ BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
};
+ using ScalarNoConst = typename internal::remove_const<Scalar>::type;
+
+ using InputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>;
+ using OutputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>;
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset())
{
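The first hunk replaces the hard-coded BlockAccess = false with a forward from the argument's evaluator: a chip of a block-capable expression is itself block-capable. The pattern, reduced to a hypothetical two-class sketch (names are illustrative, not Eigen's):

// Capability forwarding: the wrapper advertises block access only if
// the evaluator it wraps does.
struct LeafEvaluator {
  enum { BlockAccess = true };
};

template <typename ArgEvaluator>
struct ChipLikeEvaluator {
  enum { BlockAccess = ArgEvaluator::BlockAccess };  // forwarded, not false
};

static_assert(ChipLikeEvaluator<LeafEvaluator>::BlockAccess,
              "block access propagates from the wrapped evaluator");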
@@ -184,6 +189,23 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
}
m_inputStride *= input_dims[m_dim.actualDim()];
m_inputOffset = m_stride * op.offset();
+
+ if (BlockAccess) {
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_inputStrides[0] = 1;
+ for (int i = 1; i < NumInputDims; ++i) {
+ m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
+ }
+ } else {
+ m_inputStrides[NumInputDims - 1] = 1;
+ for (int i = NumInputDims - 2; i >= 0; --i) {
+ m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
+ }
+ }
+
+ m_block_total_size_max =
+ numext::maxi<Index>(1, device.lastLevelCacheSize() / sizeof(Scalar));
+ }
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
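This hunk precomputes the input tensor's strides once at construction (only when BlockAccess is on; the scalar and packet paths never touch m_inputStrides) and derives m_block_total_size_max, the per-block coefficient cap, from the device's last-level cache size so that a block stays cache-resident. A standalone sketch of the column-major branch with simplified types (the row-major branch mirrors it from the other end):

#include <array>
#include <cstddef>
#include <iostream>

// Column-major strides: the stride of dim i is the product of all dims
// before it, so the innermost dimension is contiguous.
template <std::size_t N>
std::array<std::ptrdiff_t, N> colMajorStrides(
    const std::array<std::ptrdiff_t, N>& dims) {
  std::array<std::ptrdiff_t, N> strides;
  strides[0] = 1;
  for (std::size_t i = 1; i < N; ++i) {
    strides[i] = strides[i - 1] * dims[i - 1];
  }
  return strides;
}

int main() {
  // For a 2x3x5 tensor: stepping along dim 2 skips 2*3 = 6 scalars.
  const auto s = colMajorStrides<3>({{2, 3, 5}});
  std::cout << s[0] << " " << s[1] << " " << s[2] << "\n";  // prints 1 2 6
}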
@@ -266,6 +288,60 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
TensorOpCost(0, 0, cost, vectorized, PacketSize);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {
+ resources->push_back(internal::TensorOpResourceRequirements(
+ internal::TensorBlockShapeType::kSkewedInnerDims,
+ m_block_total_size_max));
+ m_impl.getResourceRequirements(resources);
+ }
+
+ // TODO(andydavis) Reduce the overhead of this function (experiment with
+ // using a fixed block size).
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+ OutputTensorBlock* output_block) const {
+ // Calculate input block sizes.
+ const DSizes<Index, NumDims>& output_block_sizes =
+ output_block->block_sizes();
+ const DSizes<Index, NumDims>& output_block_strides =
+ output_block->block_strides();
+ const Index chip_dim = m_dim.actualDim();
+ DSizes<Index, NumInputDims> input_block_sizes;
+ DSizes<Index, NumInputDims> input_block_strides;
+ for (Index i = 0; i < NumInputDims; ++i) {
+ if (i < chip_dim) {
+ input_block_sizes[i] = output_block_sizes[i];
+ input_block_strides[i] = output_block_strides[i];
+ } else if (i > chip_dim) {
+ input_block_sizes[i] = output_block_sizes[i - 1];
+ input_block_strides[i] = output_block_strides[i - 1];
+ } else {
+ input_block_sizes[i] = 1;
+ }
+ }
+ // Fix up input_block_stride for chip dimension.
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ if (chip_dim == 0) {
+ input_block_strides[chip_dim] = 1;
+ } else {
+ input_block_strides[chip_dim] =
+ input_block_strides[chip_dim - 1] * input_block_sizes[chip_dim - 1];
+ }
+ } else {
+ if (chip_dim == NumInputDims - 1) {
+ input_block_strides[chip_dim] = 1;
+ } else {
+ input_block_strides[chip_dim] =
+ input_block_strides[chip_dim + 1] * input_block_sizes[chip_dim + 1];
+ }
+ }
+ // Instantiate and read input block from input tensor.
+ InputTensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
+ input_block_sizes, input_block_strides,
+ m_inputStrides, output_block->data());
+ m_impl.block(&input_block);
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const {
CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data());
if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumDims) ||
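block() is the heart of the read path: it converts the requested (N-1)-dimensional output block into an N-dimensional input block and delegates to the argument evaluator, which writes straight into the output block's buffer. Dimensions below the chipped one map across unchanged, dimensions above it shift by one, and the chipped dimension is pinned to size 1; since that dimension has no output counterpart, its stride is then rebuilt from its layout-inner neighbor. A reduced sketch of the size mapping with simplified types (Eigen uses fixed-size DSizes, not std::vector):

#include <cstddef>
#include <iostream>
#include <vector>

std::vector<std::ptrdiff_t> inputBlockSizes(
    const std::vector<std::ptrdiff_t>& output_sizes, std::size_t chip_dim) {
  std::vector<std::ptrdiff_t> input_sizes(output_sizes.size() + 1);
  for (std::size_t i = 0; i < input_sizes.size(); ++i) {
    if (i < chip_dim) {
      input_sizes[i] = output_sizes[i];      // below the chip: unchanged
    } else if (i > chip_dim) {
      input_sizes[i] = output_sizes[i - 1];  // above the chip: shift by one
    } else {
      input_sizes[i] = 1;                    // the chipped dim collapses
    }
  }
  return input_sizes;
}

int main() {
  // A 4x8 output block with chip_dim == 1 reads a 4x1x8 input block.
  for (std::ptrdiff_t s : inputBlockSizes({4, 8}, 1)) std::cout << s << ' ';
  std::cout << '\n';  // prints: 4 1 8
}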
@@ -316,6 +392,8 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
Index m_stride;
Index m_inputOffset;
Index m_inputStride;
+ Index m_block_total_size_max;
+ DSizes<Index, NumInputDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
const internal::DimensionId<DimId> m_dim;
const Device& m_device;
@@ -342,12 +420,18 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
- IsAligned = false,
+ IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- RawAccess = false
+ BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ RawAccess = false
};
+ using ScalarNoConst = typename internal::remove_const<Scalar>::type;
+
+ using InputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>;
+ using OutputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>;
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }
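The writer-side evaluator gains the same traits, plus a Layout enumerator its old enum lacked; the TensorBlock aliases directly above require Layout as a template argument. ScalarNoConst exists because the expression's Scalar may be const-qualified while block storage must be writable; internal::remove_const behaves like the standard trait, illustrated here with std::remove_const:

#include <type_traits>

template <typename Scalar>
struct BlockStorageSketch {
  // Strip const so the block buffer's element type is assignable.
  using ScalarNoConst = typename std::remove_const<Scalar>::type;
  static_assert(!std::is_const<ScalarNoConst>::value,
                "block storage must use a writable scalar type");
};

int main() {
  BlockStorageSketch<const float> reader_side;  // Scalar = const float
  (void)reader_side;
}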
@@ -395,6 +479,50 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
}
}
}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+ const OutputTensorBlock& output_block) {
+ // Calculate input block sizes.
+ const DSizes<Index, NumDims>& output_block_sizes =
+ output_block.block_sizes();
+ const DSizes<Index, NumDims>& output_block_strides =
+ output_block.block_strides();
+ const Index chip_dim = this->m_dim.actualDim();
+ DSizes<Index, NumInputDims> input_block_sizes;
+ DSizes<Index, NumInputDims> input_block_strides;
+ for (Index i = 0; i < NumInputDims; ++i) {
+ if (i < chip_dim) {
+ input_block_sizes[i] = output_block_sizes[i];
+ input_block_strides[i] = output_block_strides[i];
+ } else if (i > chip_dim) {
+ input_block_sizes[i] = output_block_sizes[i - 1];
+ input_block_strides[i] = output_block_strides[i - 1];
+ } else {
+ input_block_sizes[i] = 1;
+ }
+ }
+ // Fix up input_block_stride for chip dimension.
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ if (chip_dim == 0) {
+ input_block_strides[chip_dim] = 1;
+ } else {
+ input_block_strides[chip_dim] =
+ input_block_strides[chip_dim - 1] * input_block_sizes[chip_dim - 1];
+ }
+ } else {
+ if (chip_dim == NumInputDims - 1) {
+ input_block_strides[chip_dim] = 1;
+ } else {
+ input_block_strides[chip_dim] =
+ input_block_strides[chip_dim + 1] * input_block_sizes[chip_dim + 1];
+ }
+ }
+ // Write input block.
+ this->m_impl.writeBlock(InputTensorBlock(
+ this->srcCoeff(output_block.first_coeff_index()), input_block_sizes,
+ input_block_strides, this->m_inputStrides,
+ const_cast<ScalarNoConst*>(output_block.data())));
+ }
};
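writeBlock() mirrors block(): the same size/stride translation, but the resulting InputTensorBlock wraps the output block's buffer as a source and is handed to the argument evaluator's writeBlock, so a chip expression on the left-hand side of an assignment can also be filled block by block. (The mapping logic is duplicated verbatim between the two methods; a later refactor could share it.) A hedged end-to-end check through the public API, with the executor deciding internally whether the block path is taken:

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

int main() {
  Eigen::Tensor<float, 3> t(4, 3, 8);
  t.setRandom();

  // Read path: TensorEvaluator<const TensorChippingOp, ...>::block().
  Eigen::Tensor<float, 2> plane = t.chip(2, 1);

  // Write path: TensorEvaluator<TensorChippingOp, ...>::writeBlock().
  Eigen::Tensor<float, 3> u(4, 3, 8);
  u.setZero();
  u.chip(2, 1) = plane;

  // The chipped plane must round-trip unchanged.
  Eigen::Tensor<float, 2> back = u.chip(2, 1);
  for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 8; ++j)
      assert(back(i, j) == plane(i, j));
  return 0;
}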