author    Gael Guennebaud <g.gael@free.fr>  2018-07-31 08:13:00 +0000
committer Gael Guennebaud <g.gael@free.fr>  2018-07-31 08:13:00 +0000
commit    678a0dcb12d55e1d85aade7b34c706b2a5d2d49e (patch)
tree      72540698831395b25a5f481ed5182ccbee7e0129 /unsupported/Eigen/CXX11/src
parent    679eece8760ce9b9ff09e48b6ee8673afcf94caa (diff)
parent    966c2a7bb62a8b5b9ecd349730ffcd3b5719837d (diff)
Merged in ezhulenev/eigen/tiling_3 (pull request PR-438)
Tiled tensor executor
Diffstat (limited to 'unsupported/Eigen/CXX11/src')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h              |  44
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h               | 639
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h        |   1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h            |   2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h       |   2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h         |   1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h          |   1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h         |  10
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h     |   1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h          |  16
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h              |   1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h           |  98
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h            | 365
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h           |   2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h          |   1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h |   8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h          |   1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h          |   2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h            |   8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h             |   1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h               |   1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h           |   1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorRef.h                 |   3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h             |   2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h           |   2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h            |   2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h               |   7
27 files changed, 1032 insertions, 190 deletions
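
For orientation, a minimal sketch of the kind of expression this patch targets, using only the public Tensor API (which is unchanged by the patch). Whether the tiled path is actually taken is decided internally by TensorExecutor, based on the evaluators' new BlockAccess flag and the device; this example only shows an assignment that is a candidate for block-by-block evaluation.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Two coefficient-wise operands; their sum is the kind of expression the
  // new tiled (block-based) evaluation path targets.
  Eigen::Tensor<float, 2> a(1024, 1024);
  Eigen::Tensor<float, 2> b(1024, 1024);
  a.setRandom();
  b.setRandom();

  // The assignment is dispatched through internal::TensorExecutor; with this
  // patch, evaluators that advertise BlockAccess can be evaluated tile by
  // tile so that each tile's working set stays close to the L1 cache.
  Eigen::Tensor<float, 2> c = a + b;
  return c.size() == a.size() ? 0 : 1;
}
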
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index 027305586..199ddb123 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -68,6 +68,8 @@ class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType>
typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index;
+ static const int NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions;
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs)
: m_lhs_xpr(lhs), m_rhs_xpr(rhs) {}
@@ -95,20 +97,33 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
+
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+ static const int NumDims = XprType::NumDims;
enum {
- IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
+ IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned &
+ TensorEvaluator<RightArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
+ TensorEvaluator<RightArgType, Device>::PacketAccess,
+ BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
+ TensorEvaluator<RightArgType, Device>::BlockAccess,
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
};
+ typedef typename internal::TensorBlock<
+ typename internal::remove_const<Scalar>::type, Index, NumDims, Layout>
+ TensorBlock;
+
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
m_leftImpl(op.lhsExpression(), device),
m_rightImpl(op.rhsExpression(), device)
{
- EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT(
+ (static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
+ static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+ YOU_MADE_A_PROGRAMMING_MISTAKE);
}
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
@@ -164,6 +179,25 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {
+ m_leftImpl.getResourceRequirements(resources);
+ m_rightImpl.getResourceRequirements(resources);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) {
+ if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
+ m_leftImpl.data() != nullptr) {
+ TensorBlock left_block(block->first_coeff_index(), block->block_sizes(),
+ block->tensor_strides(), block->tensor_strides(),
+ m_leftImpl.data() + block->first_coeff_index());
+ m_rightImpl.block(&left_block);
+ } else {
+ m_rightImpl.block(block);
+ m_leftImpl.writeBlock(*block);
+ }
+ }
+
/// required by sycl in order to extract the accessor
const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
/// required by sycl in order to extract the accessor
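
The evalBlock() added above has two paths: if the destination evaluator offers raw access, the right-hand side is evaluated straight into destination memory; otherwise it is evaluated into the scratch block and then scattered with writeBlock(). A minimal standalone sketch of that decision follows; all names here (Block, eval_rhs_block, write_block, eval_block) are hypothetical stand-ins, not Eigen's internal API, and the sketch flattens the block to 1-D.

#include <vector>

struct Block {
  long first_coeff_index;  // offset of the block in the flattened tensor
  long size;               // number of coefficients in the block (1-D sketch)
  float* data;             // memory the block reads from / writes to
};

// Stand-in for m_rightImpl.block(&b): evaluate the RHS expression into the block.
static void eval_rhs_block(Block* b) {
  for (long i = 0; i < b->size; ++i)
    b->data[i] = static_cast<float>(b->first_coeff_index + i);
}

// Stand-in for m_leftImpl.writeBlock(b): copy a finished block into the destination.
static void write_block(float* dst, const Block& b) {
  for (long i = 0; i < b.size; ++i) dst[b.first_coeff_index + i] = b.data[i];
}

// Mirrors the shape of evalBlock() above: with raw access to the destination,
// evaluate directly into destination memory; otherwise go through scratch
// memory plus an explicit writeBlock.
static void eval_block(float* dst, bool dst_has_raw_access, Block* scratch) {
  if (dst_has_raw_access) {
    Block direct{scratch->first_coeff_index, scratch->size,
                 dst + scratch->first_coeff_index};
    eval_rhs_block(&direct);
  } else {
    eval_rhs_block(scratch);
    write_block(dst, *scratch);
  }
}

int main() {
  std::vector<float> dst(8, 0.f), tmp(4, 0.f);
  Block scratch{4, 4, tmp.data()};
  eval_block(dst.data(), /*dst_has_raw_access=*/true, &scratch);   // fast path
  eval_block(dst.data(), /*dst_has_raw_access=*/false, &scratch);  // generic path
  return 0;
}
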
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
index 59535cd91..84cf6d216 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -14,6 +14,32 @@
namespace Eigen {
namespace internal {
+namespace {
+
+// Helper template to choose between ColMajor and RowMajor values.
+template <int Layout>
+struct cond;
+
+template <>
+struct cond<ColMajor> {
+ template <typename T>
+ EIGEN_STRONG_INLINE const T& operator()(const T& col,
+ const T& /*row*/) const {
+ return col;
+ }
+};
+
+template <>
+struct cond<RowMajor> {
+ template <typename T>
+ EIGEN_STRONG_INLINE const T& operator()(const T& /*col*/,
+ const T& row) const {
+ return row;
+ }
+};
+
+} // namespace
+
/**
* \class TensorBlockShapeType
* \ingroup CXX11_Tensor_Module
@@ -39,6 +65,40 @@ enum class TensorBlockShapeType {
kSkewedInnerDims,
};
+struct TensorOpResourceRequirements {
+ TensorBlockShapeType block_shape;
+ Index block_total_size;
+ // TODO(andydavis) Add 'target_num_threads' to support communication of
+ // thread-resource requirements. This will allow ops deep in the
+ // expression tree (like reductions) to communicate resources
+ // requirements based on local state (like the total number of reductions
+ // to be computed).
+ TensorOpResourceRequirements(internal::TensorBlockShapeType shape,
+ const Index size)
+ : block_shape(shape), block_total_size(size) {}
+};
+
+// Tries to merge multiple resource requirements.
+EIGEN_STRONG_INLINE void MergeResourceRequirements(
+ const std::vector<TensorOpResourceRequirements>& resources,
+ TensorBlockShapeType* block_shape, Index* block_total_size) {
+ if (resources.empty()) {
+ return;
+ }
+ // TODO(andydavis) Implement different policies (i.e. revert to a default
+ // policy if block shapes/sizes conflict).
+ *block_shape = resources[0].block_shape;
+ *block_total_size = resources[0].block_total_size;
+ for (int i = 1; i < resources.size(); ++i) {
+ if (resources[i].block_shape == TensorBlockShapeType::kSkewedInnerDims &&
+ *block_shape != TensorBlockShapeType::kSkewedInnerDims) {
+ *block_shape = TensorBlockShapeType::kSkewedInnerDims;
+ }
+ *block_total_size =
+ numext::maxi(*block_total_size, resources[i].block_total_size);
+ }
+}
+
/**
* \class TensorBlock
* \ingroup CXX11_Tensor_Module
@@ -48,12 +108,12 @@ enum class TensorBlockShapeType {
* This class represents a tensor block specified by the index of the
* first block coefficient, and the size of the block in each dimension.
*/
-template <typename Scalar, typename Index, std::size_t NumDims, int Layout>
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorBlock {
public:
- typedef DSizes<Index, NumDims> Dimensions;
+ typedef DSizes<StorageIndex, NumDims> Dimensions;
- TensorBlock(const Index first_coeff_index, const Dimensions& block_sizes,
+ TensorBlock(const StorageIndex first_coeff_index, const Dimensions& block_sizes,
const Dimensions& block_strides, const Dimensions& tensor_strides,
Scalar* data)
: m_first_coeff_index(first_coeff_index),
@@ -62,7 +122,7 @@ class TensorBlock {
m_tensor_strides(tensor_strides),
m_data(data) {}
- Index first_coeff_index() const { return m_first_coeff_index; }
+ StorageIndex first_coeff_index() const { return m_first_coeff_index; }
const Dimensions& block_sizes() const { return m_block_sizes; }
@@ -75,13 +135,487 @@ class TensorBlock {
const Scalar* data() const { return m_data; }
private:
- Index m_first_coeff_index;
+ StorageIndex m_first_coeff_index;
Dimensions m_block_sizes;
Dimensions m_block_strides;
Dimensions m_tensor_strides;
Scalar* m_data; // Not owned.
};
+template <typename Scalar, typename StorageIndex>
+struct TensorBlockCopyOp {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const StorageIndex num_coeff_to_copy, const StorageIndex dst_index,
+ const StorageIndex dst_stride, Scalar* EIGEN_RESTRICT dst_data,
+ const StorageIndex src_index, const StorageIndex src_stride,
+ const Scalar* EIGEN_RESTRICT src_data) {
+ const Scalar* src_base = &src_data[src_index];
+ Scalar* dst_base = &dst_data[dst_index];
+
+ using Src = const Eigen::Array<Scalar, Dynamic, 1>;
+ using Dst = Eigen::Array<Scalar, Dynamic, 1>;
+
+ using SrcMap = Eigen::Map<Src, 0, InnerStride<>>;
+ using DstMap = Eigen::Map<Dst, 0, InnerStride<>>;
+
+ const SrcMap src(src_base, num_coeff_to_copy, InnerStride<>(src_stride));
+ DstMap dst(dst_base, num_coeff_to_copy, InnerStride<>(dst_stride));
+
+ dst = src;
+ }
+};
+
+/**
+ * \class TensorBlockIO
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor block IO class.
+ *
+ * This class is responsible for copying data between a tensor and a tensor
+ * block.
+ */
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout,
+ bool BlockRead>
+class TensorBlockIO {
+ public:
+ typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
+ TensorBlock;
+ typedef typename internal::TensorBlockCopyOp<Scalar, StorageIndex>
+ TensorBlockCopyOp;
+
+ protected:
+ struct BlockIteratorState {
+ StorageIndex input_stride;
+ StorageIndex output_stride;
+ StorageIndex input_span;
+ StorageIndex output_span;
+ StorageIndex size;
+ StorageIndex count;
+ };
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(
+ const TensorBlock& block, StorageIndex first_coeff_index,
+ const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
+ const array<StorageIndex, NumDims>& tensor_strides, const Scalar* src_data,
+ Scalar* dst_data) {
+ // Find the innermost tensor dimension whose size is not 1. This is the
+ // effective inner dim. If all dimensions are of size 1, then fallback to
+ // using the actual innermost dim to avoid out-of-bound access.
+ StorageIndex num_size_one_inner_dims = 0;
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = cond<Layout>()(i, NumDims - i - 1);
+ if (block.block_sizes()[tensor_to_block_dim_map[dim]] != 1) {
+ num_size_one_inner_dims = i;
+ break;
+ }
+ }
+ // Calculate strides and dimensions.
+ const StorageIndex tensor_stride1_dim = cond<Layout>()(
+ num_size_one_inner_dims, NumDims - num_size_one_inner_dims - 1);
+ const StorageIndex block_dim_for_tensor_stride1_dim =
+ NumDims == 0 ? 1 : tensor_to_block_dim_map[tensor_stride1_dim];
+ size_t block_inner_dim_size =
+ NumDims == 0 ? 1
+ : block.block_sizes()[block_dim_for_tensor_stride1_dim];
+ for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
+ const int dim = cond<Layout>()(i, NumDims - i - 1);
+ const StorageIndex block_stride =
+ block.block_strides()[tensor_to_block_dim_map[dim]];
+ if (block_inner_dim_size == block_stride &&
+ block_stride == tensor_strides[dim]) {
+ block_inner_dim_size *=
+ block.block_sizes()[tensor_to_block_dim_map[dim]];
+ ++num_size_one_inner_dims;
+ } else {
+ break;
+ }
+ }
+
+ StorageIndex inputIndex;
+ StorageIndex outputIndex;
+ StorageIndex input_stride;
+ StorageIndex output_stride;
+
+ // Setup strides to read/write along the tensor's stride1 dimension.
+ if (BlockRead) {
+ inputIndex = first_coeff_index;
+ outputIndex = 0;
+ input_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim];
+ output_stride =
+ NumDims == 0
+ ? 1
+ : block.block_strides()[block_dim_for_tensor_stride1_dim];
+ } else {
+ inputIndex = 0;
+ outputIndex = first_coeff_index;
+ input_stride =
+ NumDims == 0
+ ? 1
+ : block.block_strides()[block_dim_for_tensor_stride1_dim];
+ output_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim];
+ }
+
+ const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
+ array<BlockIteratorState, at_least_1_dim> block_iter_state;
+
+ // Initialize block iterator state. Squeeze away any dimension of size 1.
+ int num_squeezed_dims = 0;
+ for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
+ const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
+ const StorageIndex size = block.block_sizes()[tensor_to_block_dim_map[dim]];
+ if (size == 1) {
+ continue;
+ }
+ block_iter_state[num_squeezed_dims].size = size;
+ if (BlockRead) {
+ block_iter_state[num_squeezed_dims].input_stride = tensor_strides[dim];
+ block_iter_state[num_squeezed_dims].output_stride =
+ block.block_strides()[tensor_to_block_dim_map[dim]];
+ } else {
+ block_iter_state[num_squeezed_dims].input_stride =
+ block.block_strides()[tensor_to_block_dim_map[dim]];
+ block_iter_state[num_squeezed_dims].output_stride = tensor_strides[dim];
+ }
+ block_iter_state[num_squeezed_dims].input_span =
+ block_iter_state[num_squeezed_dims].input_stride *
+ (block_iter_state[num_squeezed_dims].size - 1);
+ block_iter_state[num_squeezed_dims].output_span =
+ block_iter_state[num_squeezed_dims].output_stride *
+ (block_iter_state[num_squeezed_dims].size - 1);
+ block_iter_state[num_squeezed_dims].count = 0;
+ ++num_squeezed_dims;
+ }
+
+ // Iterate copying data from src to dst.
+ const StorageIndex block_total_size =
+ NumDims == 0 ? 1 : block.block_sizes().TotalSize();
+ for (StorageIndex i = 0; i < block_total_size; i += block_inner_dim_size) {
+ TensorBlockCopyOp::Run(block_inner_dim_size, outputIndex, output_stride,
+ dst_data, inputIndex, input_stride, src_data);
+ // Update index.
+ for (int j = 0; j < num_squeezed_dims; ++j) {
+ if (++block_iter_state[j].count < block_iter_state[j].size) {
+ inputIndex += block_iter_state[j].input_stride;
+ outputIndex += block_iter_state[j].output_stride;
+ break;
+ }
+ block_iter_state[j].count = 0;
+ inputIndex -= block_iter_state[j].input_span;
+ outputIndex -= block_iter_state[j].output_span;
+ }
+ }
+ }
+};
+
+/**
+ * \class TensorBlockReader
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor block reader class.
+ *
+ * This class is responsible for reading a tensor block.
+ *
+ */
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
+class TensorBlockReader : public TensorBlockIO<Scalar, StorageIndex, NumDims,
+ Layout, /*BlockRead=*/true> {
+ public:
+ typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
+ TensorBlock;
+ typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/true>
+ Base;
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ TensorBlock* block, const Scalar* src_data) {
+ array<StorageIndex, NumDims> tensor_to_block_dim_map;
+ for (int i = 0; i < NumDims; ++i) {
+ tensor_to_block_dim_map[i] = i;
+ }
+ Base::Copy(*block, block->first_coeff_index(), tensor_to_block_dim_map,
+ block->tensor_strides(), src_data, block->data());
+ }
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ TensorBlock* block, StorageIndex first_coeff_index,
+ const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
+ const array<StorageIndex, NumDims>& tensor_strides, const Scalar* src_data) {
+ Base::Copy(*block, first_coeff_index, tensor_to_block_dim_map,
+ tensor_strides, src_data, block->data());
+ }
+};
+
+/**
+ * \class TensorBlockWriter
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor block writer class.
+ *
+ * This class is responsible for writing a tensor block.
+ *
+ */
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
+class TensorBlockWriter : public TensorBlockIO<Scalar, StorageIndex, NumDims,
+ Layout, /*BlockRead=*/false> {
+ public:
+ typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
+ TensorBlock;
+ typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/false>
+ Base;
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const TensorBlock& block, Scalar* dst_data) {
+ array<StorageIndex, NumDims> tensor_to_block_dim_map;
+ for (int i = 0; i < NumDims; ++i) {
+ tensor_to_block_dim_map[i] = i;
+ }
+ Base::Copy(block, block.first_coeff_index(), tensor_to_block_dim_map,
+ block.tensor_strides(), block.data(), dst_data);
+ }
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const TensorBlock& block, StorageIndex first_coeff_index,
+ const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
+ const array<StorageIndex, NumDims>& tensor_strides, Scalar* dst_data) {
+ Base::Copy(block, first_coeff_index, tensor_to_block_dim_map,
+ tensor_strides, block.data(), dst_data);
+ }
+};
+
+/**
+ * \class TensorBlockCwiseBinaryOp
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Carries out a cwise binary op on a number of coefficients.
+ *
+ * This class reads strided inputs from left and right operands, and writes the
+ * result of the cwise binary op to the strided output array.
+ *
+ */
+struct TensorBlockCwiseBinaryOp {
+ template <typename StorageIndex, typename BinaryFunctor, typename OutputScalar,
+ typename LeftScalar, typename RightScalar>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const BinaryFunctor& functor, const StorageIndex num_coeff,
+ const StorageIndex output_index, const StorageIndex output_stride,
+ OutputScalar* output_data, const StorageIndex left_index,
+ const StorageIndex left_stride, const LeftScalar* left_data,
+ const StorageIndex right_index, const StorageIndex right_stride,
+ const RightScalar* right_data) {
+ using Lhs = const Eigen::Array<LeftScalar, Dynamic, 1>;
+ using Rhs = const Eigen::Array<RightScalar, Dynamic, 1>;
+ using Out = Eigen::Array<OutputScalar, Dynamic, 1>;
+
+ using LhsMap = Eigen::Map<Lhs, 0, InnerStride<>>;
+ using RhsMap = Eigen::Map<Rhs, 0, InnerStride<>>;
+ using OutMap = Eigen::Map<Out, 0, InnerStride<>>;
+
+ const LeftScalar* lhs_base = &left_data[left_index];
+ const RightScalar* rhs_base = &right_data[right_index];
+ OutputScalar* out_base = &output_data[output_index];
+
+ const LhsMap lhs(lhs_base, num_coeff, InnerStride<>(left_stride));
+ const RhsMap rhs(rhs_base, num_coeff, InnerStride<>(right_stride));
+ OutMap out(out_base, num_coeff, InnerStride<>(output_stride));
+
+ out =
+ Eigen::CwiseBinaryOp<BinaryFunctor, LhsMap, RhsMap>(lhs, rhs, functor);
+ }
+};
+
+/**
+ * \class TensorBlockCwiseBinaryIO
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor block IO class for carrying out cwise binary ops.
+ *
+ * This class carries out the binary op on given blocks.
+ *
+ */
+template <typename BinaryFunctor, typename StorageIndex, typename OutputScalar,
+ int NumDims, int Layout>
+struct TensorBlockCwiseBinaryIO {
+ typedef typename internal::TensorBlock<OutputScalar, StorageIndex, NumDims,
+ Layout>::Dimensions Dimensions;
+
+ struct BlockIteratorState {
+ StorageIndex output_stride, output_span;
+ StorageIndex left_stride, left_span;
+ StorageIndex right_stride, right_span;
+ StorageIndex size, count;
+ };
+
+ template <typename LeftScalar, typename RightScalar>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const BinaryFunctor& functor, const Dimensions& block_sizes,
+ const Dimensions& block_strides, OutputScalar* output_data,
+ const array<StorageIndex, NumDims>& left_strides,
+ const LeftScalar* left_data,
+ const array<StorageIndex, NumDims>& right_strides,
+ const RightScalar* right_data) {
+ // Find the innermost dimension whose size is not 1. This is the effective
+ // inner dim. If all dimensions are of size 1, fallback to using the actual
+ // innermost dim to avoid out-of-bound access.
+ int num_size_one_inner_dims = 0;
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = cond<Layout>()(i, NumDims - i - 1);
+ if (block_sizes[dim] != 1) {
+ num_size_one_inner_dims = i;
+ break;
+ }
+ }
+ // Calculate strides and dimensions.
+ const int inner_dim =
+ NumDims == 0 ? 1
+ : cond<Layout>()(num_size_one_inner_dims,
+ NumDims - num_size_one_inner_dims - 1);
+ StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim];
+ for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
+ const int dim = cond<Layout>()(i, NumDims - i - 1);
+ // Merge multiple inner dims into one for larger inner dim size (i.e.
+ // fewer calls to TensorBlockCwiseBinaryOp::Run()).
+ if (inner_dim_size == block_strides[dim] &&
+ block_strides[dim] == left_strides[dim] &&
+ block_strides[dim] == right_strides[dim]) {
+ inner_dim_size *= block_sizes[dim];
+ ++num_size_one_inner_dims;
+ } else {
+ break;
+ }
+ }
+
+ StorageIndex output_index = 0, left_index = 0, right_index = 0;
+ const StorageIndex output_stride =
+ NumDims == 0 ? 1 : block_strides[inner_dim];
+ const StorageIndex left_stride = NumDims == 0 ? 1 : left_strides[inner_dim];
+ const StorageIndex right_stride =
+ NumDims == 0 ? 1 : right_strides[inner_dim];
+
+ const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
+ array<BlockIteratorState, at_least_1_dim> block_iter_state;
+
+ // Initialize block iterator state. Squeeze away any dimension of size 1.
+ int num_squeezed_dims = 0;
+ for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
+ const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
+ const StorageIndex size = block_sizes[dim];
+ if (size == 1) {
+ continue;
+ }
+ auto& state = block_iter_state[num_squeezed_dims];
+ state.output_stride = block_strides[dim];
+ state.left_stride = left_strides[dim];
+ state.right_stride = right_strides[dim];
+ state.size = size;
+ state.output_span = state.output_stride * (size - 1);
+ state.left_span = state.left_stride * (size - 1);
+ state.right_span = state.right_stride * (size - 1);
+ state.count = 0;
+ ++num_squeezed_dims;
+ }
+
+ // Compute cwise binary op.
+ const StorageIndex block_total_size =
+ NumDims == 0 ? 1 : block_sizes.TotalSize();
+ for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
+ TensorBlockCwiseBinaryOp::Run(functor, inner_dim_size, output_index,
+ output_stride, output_data, left_index,
+ left_stride, left_data, right_index,
+ right_stride, right_data);
+ // Update index.
+ for (int j = 0; j < num_squeezed_dims; ++j) {
+ auto& state = block_iter_state[j];
+ if (++state.count < state.size) {
+ output_index += state.output_stride;
+ left_index += state.left_stride;
+ right_index += state.right_stride;
+ break;
+ }
+ state.count = 0;
+ output_index -= state.output_span;
+ left_index -= state.left_span;
+ right_index -= state.right_span;
+ }
+ }
+ }
+};
+
+/**
+ * \class TensorBlockView
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Read-only view into a block of data.
+ *
+ * This class provides read-only access to a block of data in impl. It may need
+ * to allocate space for holding the intermediate result.
+ *
+ */
+template <class ArgType, class Device>
+struct TensorBlockView {
+ typedef TensorEvaluator<ArgType, Device> Impl;
+ typedef typename Impl::Index StorageIndex;
+ typedef typename remove_const<typename Impl::Scalar>::type Scalar;
+ static const int NumDims = array_size<typename Impl::Dimensions>::value;
+ typedef DSizes<StorageIndex, NumDims> Dimensions;
+
+ // Constructs a TensorBlockView for `impl`. `block` is only used for
+ // specifying the start offset, shape, and strides of the block.
+ template <typename OtherTensorBlock>
+ TensorBlockView(const Device& device,
+ const TensorEvaluator<ArgType, Device>& impl,
+ const OtherTensorBlock& block)
+ : m_device(device),
+ m_block_sizes(block.block_sizes()),
+ m_data(NULL),
+ m_allocated_data(NULL) {
+ if (Impl::RawAccess && impl.data() != NULL) {
+ m_data = impl.data() + block.first_coeff_index();
+ m_block_strides = block.tensor_strides();
+ } else {
+ // Actually make a copy.
+
+ // TODO(wuke): This sometimes puts a lot of pressure on the heap allocator.
+ // Consider allowing ops to request additional temporary block memory in
+ // TensorOpResourceRequirements.
+ m_allocated_data = static_cast<Scalar*>(
+ m_device.allocate(m_block_sizes.TotalSize() * sizeof(Scalar)));
+ m_data = m_allocated_data;
+ if (NumDims > 0) {
+ if (static_cast<int>(Impl::Layout) == static_cast<int>(ColMajor)) {
+ m_block_strides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_block_strides[i] = m_block_strides[i - 1] * m_block_sizes[i - 1];
+ }
+ } else {
+ m_block_strides[NumDims - 1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_block_strides[i] = m_block_strides[i + 1] * m_block_sizes[i + 1];
+ }
+ }
+ }
+ TensorBlock<Scalar, StorageIndex, NumDims, Impl::Layout> input_block(
+ block.first_coeff_index(), m_block_sizes, m_block_strides,
+ block.tensor_strides(), m_allocated_data);
+ impl.block(&input_block);
+ }
+ }
+
+ ~TensorBlockView() {
+ if (m_allocated_data != NULL) {
+ m_device.deallocate(m_allocated_data);
+ }
+ }
+
+ const Dimensions& block_sizes() const { return m_block_sizes; }
+ const Dimensions& block_strides() const { return m_block_strides; }
+ const Scalar* data() const { return m_data; }
+
+ private:
+ const Device& m_device;
+ Dimensions m_block_sizes, m_block_strides;
+ const Scalar* m_data; // Not owned.
+ Scalar* m_allocated_data; // Owned.
+};
+
/**
* \class TensorBlockMapper
* \ingroup CXX11_Tensor_Module
@@ -90,21 +624,21 @@ class TensorBlock {
*
* This class is responsible for iterating over the blocks of a tensor.
*/
-template <typename Scalar, typename Index, std::size_t NumDims, int Layout>
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorBlockMapper {
public:
- typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
+ typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
TensorBlock;
- typedef DSizes<Index, NumDims> Dimensions;
+ typedef DSizes<StorageIndex, NumDims> Dimensions;
TensorBlockMapper(const Dimensions& dims,
const TensorBlockShapeType block_shape,
- size_t min_target_size)
+ Index min_target_size)
: m_dimensions(dims),
m_block_dim_sizes(BlockDimensions(dims, block_shape, min_target_size)) {
// Calculate block counts by dimension and total block count.
- DSizes<Index, NumDims> block_count;
- for (size_t i = 0; i < block_count.rank(); ++i) {
+ DSizes<StorageIndex, NumDims> block_count;
+ for (Index i = 0; i < block_count.rank(); ++i) {
block_count[i] = divup(m_dimensions[i], m_block_dim_sizes[i]);
}
m_total_block_count = array_prod(block_count);
@@ -130,15 +664,15 @@ class TensorBlockMapper {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
- GetBlockForIndex(Index block_index, Scalar* data) const {
- Index first_coeff_index = 0;
- DSizes<Index, NumDims> coords;
- DSizes<Index, NumDims> sizes;
- DSizes<Index, NumDims> strides;
+ GetBlockForIndex(StorageIndex block_index, Scalar* data) const {
+ StorageIndex first_coeff_index = 0;
+ DSizes<StorageIndex, NumDims> coords;
+ DSizes<StorageIndex, NumDims> sizes;
+ DSizes<StorageIndex, NumDims> strides;
if (NumDims > 0) {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
- const Index idx = block_index / m_block_strides[i];
+ const StorageIndex idx = block_index / m_block_strides[i];
coords[i] = idx * m_block_dim_sizes[i];
sizes[i] =
numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]);
@@ -156,7 +690,7 @@ class TensorBlockMapper {
}
} else {
for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx = block_index / m_block_strides[i];
+ const StorageIndex idx = block_index / m_block_strides[i];
coords[i] = idx * m_block_dim_sizes[i];
sizes[i] =
numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]);
@@ -181,23 +715,20 @@ class TensorBlockMapper {
data);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const {
return m_total_block_count;
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index block_dims_total_size() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex
+ block_dims_total_size() const {
return m_block_dim_sizes.TotalSize();
}
private:
- static int InnerDimIndex(Index i) {
- return Layout == static_cast<int>(ColMajor) ? i : NumDims - i - 1;
- }
-
static Dimensions BlockDimensions(const Dimensions& tensor_dims,
const TensorBlockShapeType block_shape,
- size_t min_target_size) {
- min_target_size = numext::maxi<size_t>(1, min_target_size);
+ Index min_target_size) {
+ min_target_size = numext::maxi<Index>(1, min_target_size);
// If tensor fully fits into the target size, we'll treat it a single block.
Dimensions block_dim_sizes = tensor_dims;
@@ -226,14 +757,14 @@ class TensorBlockMapper {
dim_size_target, static_cast<size_t>(tensor_dims[i]));
}
// Add any un-allocated coefficients to inner dimension(s).
- Index total_size = block_dim_sizes.TotalSize();
+ StorageIndex total_size = block_dim_sizes.TotalSize();
for (int i = 0; i < NumDims; ++i) {
- const int dim = InnerDimIndex(i);
+ const int dim = cond<Layout>()(i, NumDims - i - 1);
if (block_dim_sizes[dim] < tensor_dims[dim]) {
- const Index total_size_other_dims =
+ const StorageIndex total_size_other_dims =
total_size / block_dim_sizes[dim];
- const Index alloc_avail =
- divup<Index>(min_target_size, total_size_other_dims);
+ const StorageIndex alloc_avail =
+ divup<StorageIndex>(min_target_size, total_size_other_dims);
if (alloc_avail == block_dim_sizes[dim]) {
// Insufficient excess coefficients to allocate.
break;
@@ -243,14 +774,14 @@ class TensorBlockMapper {
}
}
} else if (block_shape == TensorBlockShapeType::kSkewedInnerDims) {
- Index coeff_to_allocate = min_target_size;
+ StorageIndex coeff_to_allocate = min_target_size;
for (int i = 0; i < NumDims; ++i) {
- const int dim = InnerDimIndex(i);
+ const int dim = cond<Layout>()(i, NumDims - i - 1);
block_dim_sizes[dim] =
numext::mini(coeff_to_allocate, tensor_dims[dim]);
- coeff_to_allocate =
- divup(coeff_to_allocate,
- numext::maxi(static_cast<Index>(1), block_dim_sizes[dim]));
+ coeff_to_allocate = divup(
+ coeff_to_allocate,
+ numext::maxi(static_cast<StorageIndex>(1), block_dim_sizes[dim]));
}
eigen_assert(coeff_to_allocate == 1);
} else {
@@ -269,7 +800,7 @@ class TensorBlockMapper {
Dimensions m_block_dim_sizes;
Dimensions m_block_strides;
Dimensions m_tensor_strides;
- Index m_total_block_count;
+ StorageIndex m_total_block_count;
};
/**
@@ -284,12 +815,12 @@ class TensorBlockMapper {
* processed together.
*
*/
-template <typename Scalar, typename Index, std::size_t NumDims, int Layout>
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorSliceBlockMapper {
public:
- typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
+ typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
TensorBlock;
- typedef DSizes<Index, NumDims> Dimensions;
+ typedef DSizes<StorageIndex, NumDims> Dimensions;
TensorSliceBlockMapper(const Dimensions& tensor_dims,
const Dimensions& tensor_slice_offsets,
@@ -303,7 +834,7 @@ class TensorSliceBlockMapper {
m_block_stride_order(block_stride_order),
m_total_block_count(1) {
// Calculate block counts by dimension and total block count.
- DSizes<Index, NumDims> block_count;
+ DSizes<StorageIndex, NumDims> block_count;
for (size_t i = 0; i < block_count.rank(); ++i) {
block_count[i] = divup(m_tensor_slice_extents[i], m_block_dim_sizes[i]);
}
@@ -330,11 +861,11 @@ class TensorSliceBlockMapper {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
- GetBlockForIndex(Index block_index, Scalar* data) const {
- Index first_coeff_index = 0;
- DSizes<Index, NumDims> coords;
- DSizes<Index, NumDims> sizes;
- DSizes<Index, NumDims> strides;
+ GetBlockForIndex(StorageIndex block_index, Scalar* data) const {
+ StorageIndex first_coeff_index = 0;
+ DSizes<StorageIndex, NumDims> coords;
+ DSizes<StorageIndex, NumDims> sizes;
+ DSizes<StorageIndex, NumDims> strides;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = block_index / m_block_strides[i];
@@ -352,16 +883,16 @@ class TensorSliceBlockMapper {
m_block_dim_sizes[0]);
first_coeff_index += coords[0] * m_tensor_strides[0];
- Index prev_dim = m_block_stride_order[0];
+ StorageIndex prev_dim = m_block_stride_order[0];
strides[prev_dim] = 1;
for (int i = 1; i < NumDims; ++i) {
- const Index curr_dim = m_block_stride_order[i];
+ const StorageIndex curr_dim = m_block_stride_order[i];
strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
prev_dim = curr_dim;
}
} else {
- for (int i = 0; i < static_cast<int>(NumDims) - 1; ++i) {
- const Index idx = block_index / m_block_strides[i];
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const StorageIndex idx = block_index / m_block_strides[i];
coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i];
sizes[i] = numext::mini(
m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i],
@@ -377,10 +908,10 @@ class TensorSliceBlockMapper {
m_block_dim_sizes[NumDims - 1]);
first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1];
- Index prev_dim = m_block_stride_order[NumDims - 1];
+ StorageIndex prev_dim = m_block_stride_order[NumDims - 1];
strides[prev_dim] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
- const Index curr_dim = m_block_stride_order[i];
+ const StorageIndex curr_dim = m_block_stride_order[i];
strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
prev_dim = curr_dim;
}
@@ -390,7 +921,7 @@ class TensorSliceBlockMapper {
data);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const {
return m_total_block_count;
}
@@ -402,7 +933,7 @@ class TensorSliceBlockMapper {
Dimensions m_block_dim_sizes;
Dimensions m_block_stride_order;
Dimensions m_block_strides;
- Index m_total_block_count;
+ StorageIndex m_total_block_count;
};
} // namespace internal
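
The inner-loop kernels added above (TensorBlockCopyOp and TensorBlockCwiseBinaryOp) reduce to Eigen::Map views with an InnerStride, so each squeezed inner run becomes one vectorizable assignment. A small sketch of that pattern using only the public Core API; the function names strided_copy and strided_binary are illustrative, not part of Eigen.

#include <Eigen/Core>
#include <functional>
#include <vector>

// Strided copy of n coefficients: the pattern behind TensorBlockCopyOp::Run.
void strided_copy(long n, float* dst, long dst_stride,
                  const float* src, long src_stride) {
  using ConstMap = Eigen::Map<const Eigen::ArrayXf, 0, Eigen::InnerStride<> >;
  using Map = Eigen::Map<Eigen::ArrayXf, 0, Eigen::InnerStride<> >;
  ConstMap in(src, n, Eigen::InnerStride<>(src_stride));
  Map out(dst, n, Eigen::InnerStride<>(dst_stride));
  out = in;  // one vectorizable assignment per squeezed inner run
}

// Strided coefficient-wise binary op: the pattern behind
// TensorBlockCwiseBinaryOp::Run.
template <typename Functor>
void strided_binary(const Functor& f, long n,
                    float* out, long out_stride,
                    const float* lhs, long lhs_stride,
                    const float* rhs, long rhs_stride) {
  using ConstMap = Eigen::Map<const Eigen::ArrayXf, 0, Eigen::InnerStride<> >;
  using Map = Eigen::Map<Eigen::ArrayXf, 0, Eigen::InnerStride<> >;
  ConstMap l(lhs, n, Eigen::InnerStride<>(lhs_stride));
  ConstMap r(rhs, n, Eigen::InnerStride<>(rhs_stride));
  Map o(out, n, Eigen::InnerStride<>(out_stride));
  o = l.binaryExpr(r, f);  // apply the functor coefficient-wise
}

int main() {
  std::vector<float> a(12, 1.f), b(12, 2.f), c(12, 0.f);
  strided_copy(/*n=*/4, c.data(), /*dst_stride=*/3, a.data(), /*src_stride=*/3);
  strided_binary(std::plus<float>(), /*n=*/4, c.data(), /*out_stride=*/3,
                 a.data(), /*lhs_stride=*/3, b.data(), /*rhs_stride=*/3);
  return 0;
}
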
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index e647b3609..8fecbe657 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -110,6 +110,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
enum {
IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 21ffa2872..085c05f3d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -146,6 +146,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
// slice offsets.
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -343,6 +344,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
RawAccess = false
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index a7c1380b8..9f0321880 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -122,6 +122,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = false
};
@@ -306,6 +307,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = false
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 57b5339d1..86602c27e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -240,6 +240,7 @@ struct TensorContractionEvaluatorBase
enum {
IsAligned = true,
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+ BlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = true
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index 182bef918..e0cbbb315 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -195,6 +195,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = true,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 25131600d..1ec5819a7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -307,6 +307,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
enum {
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<InputArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -577,11 +578,11 @@ __global__ void EigenConvolutionKernel1D(
const float* __restrict kernel, const int numPlanes, const int numX,
const int maxX, const int kernelSize, float* buffer) {
#if defined(EIGEN_HIPCC)
- HIP_DYNAMIC_SHARED(float, s)
+ HIP_DYNAMIC_SHARED(float, s)
#else
extern __shared__ float s[];
#endif
-
+
const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
@@ -630,7 +631,7 @@ __global__ void EigenConvolutionKernel2D(
const int maxX, const int numY, const int maxY, const int kernelSizeX,
const int kernelSizeY, float* buffer) {
#if defined(EIGEN_HIPCC)
- HIP_DYNAMIC_SHARED(float, s)
+ HIP_DYNAMIC_SHARED(float, s)
#else
extern __shared__ float s[];
#endif
@@ -702,7 +703,7 @@ __global__ void EigenConvolutionKernel3D(
const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
const size_t kernelSizeZ, float* buffer) {
#if defined(EIGEN_HIPCC)
- HIP_DYNAMIC_SHARED(float, s)
+ HIP_DYNAMIC_SHARED(float, s)
#else
extern __shared__ float s[];
#endif
@@ -778,6 +779,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
enum {
IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
PacketAccess = false,
+ BlockAccess = false,
Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
index 65403905a..d301d0c01 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -242,6 +242,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
enum {
IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
PacketAccess = false,
+ BlockAccess = false,
Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index 192d4aa7b..5ca47cca7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -290,6 +290,22 @@ struct DSizes : array<DenseIndex, NumDims> {
}
}
+#ifndef EIGEN_EMULATE_CXX11_META_H
+ template <typename std::ptrdiff_t... Indices>
+ EIGEN_DEVICE_FUNC DSizes(const Sizes<Indices...>& a) {
+ for (int i = 0 ; i < NumDims; ++i) {
+ (*this)[i] = a[i];
+ }
+ }
+#else
+ template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
+ EIGEN_DEVICE_FUNC DSizes(const Sizes<V1, V2, V3, V4, V5>& a) {
+ for (int i = 0 ; i < NumDims; ++i) {
+ (*this)[i] = a[i];
+ }
+ }
+#endif
+
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) {
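
The converting constructor added above lets a compile-time Sizes<> be used where a runtime DSizes is expected. A quick sketch (public API only, assuming a C++11 build of the Tensor module so the variadic branch is active):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Sizes<3, 4, 5> fixed_dims;                          // compile-time dimensions
  Eigen::DSizes<Eigen::DenseIndex, 3> dyn_dims(fixed_dims);  // uses the ctor added above
  return dyn_dims[2] == 5 ? 0 : 1;
}
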
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index d0c027890..af39daa91 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -107,6 +107,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = true
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index fe62ff1ea..f9a1bd68c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -41,11 +41,22 @@ struct TensorEvaluator
enum {
IsAligned = Derived::IsAligned,
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+ BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
};
+ typedef typename internal::TensorBlock<
+ typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
+ TensorBlock;
+ typedef typename internal::TensorBlockReader<
+ typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
+ TensorBlockReader;
+ typedef typename internal::TensorBlockWriter<
+ typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
+ TensorBlockWriter;
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
: m_data(const_cast<typename internal::traits<Derived>::template MakePointer<Scalar>::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m)
{ }
@@ -113,6 +124,20 @@ struct TensorEvaluator
internal::unpacket_traits<PacketReturnType>::size);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
+ assert(m_data != NULL);
+ TensorBlockReader::Run(block, m_data);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+ const TensorBlock& block) {
+ assert(m_data != NULL);
+ TensorBlockWriter::Run(block, m_data);
+ }
+
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<Scalar>::Type data() const { return m_data; }
/// required by sycl in order to construct sycl buffer from raw pointer
@@ -167,11 +192,19 @@ struct TensorEvaluator<const Derived, Device>
enum {
IsAligned = Derived::IsAligned,
PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+ BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
};
+ typedef typename internal::TensorBlock<
+ typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
+ TensorBlock;
+ typedef typename internal::TensorBlockReader<
+ typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
+ TensorBlockReader;
+
// Used for accessor extraction in SYCL Managed TensorMap:
const Derived& derived() const { return m_impl; }
@@ -219,6 +252,14 @@ struct TensorEvaluator<const Derived, Device>
internal::unpacket_traits<PacketReturnType>::size);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
+ assert(m_data != NULL);
+ TensorBlockReader::Run(block, m_data);
+ }
+
EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<const Scalar>::Type data() const { return m_data; }
/// added for sycl in order to construct the buffer from the sycl device
@@ -244,6 +285,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
enum {
IsAligned = true,
PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -308,7 +350,9 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & internal::functor_traits<UnaryOp>::PacketAccess,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess &
+ internal::functor_traits<UnaryOp>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -375,16 +419,21 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType;
enum {
- IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess &
+ IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned &
+ TensorEvaluator<RightArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
+ TensorEvaluator<RightArgType, Device>::PacketAccess &
internal::functor_traits<BinaryOp>::PacketAccess,
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
+ BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
+ TensorEvaluator<RightArgType, Device>::BlockAccess,
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
- : m_functor(op.functor()),
+ : m_device(device),
+ m_functor(op.functor()),
m_leftImpl(op.lhsExpression(), device),
m_rightImpl(op.rhsExpression(), device)
{
@@ -399,6 +448,14 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
+ static const int NumDims = internal::array_size<
+ typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
+
+ typedef internal::TensorBlock<
+ typename internal::remove_const<Scalar>::type, Index, NumDims,
+ TensorEvaluator<LeftArgType, Device>::Layout>
+ TensorBlock;
+
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// TODO: use right impl instead if right impl dimensions are known at compile time.
@@ -433,6 +490,30 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {
+ m_leftImpl.getResourceRequirements(resources);
+ m_rightImpl.getResourceRequirements(resources);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+ TensorBlock* output_block) const {
+ if (NumDims <= 0) {
+ output_block->data()[0] = coeff(0);
+ return;
+ }
+ internal::TensorBlockView<LeftArgType, Device> left_block(
+ m_device, m_leftImpl, *output_block);
+ internal::TensorBlockView<RightArgType, Device> right_block(
+ m_device, m_rightImpl, *output_block);
+ internal::TensorBlockCwiseBinaryIO<
+ BinaryOp, Index, typename internal::remove_const<Scalar>::type, NumDims,
+ Layout>::Run(m_functor, output_block->block_sizes(),
+ output_block->block_strides(), output_block->data(),
+ left_block.block_strides(), left_block.data(),
+ right_block.block_strides(), right_block.data());
+ }
+
EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
/// required by sycl in order to extract the accessor
const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
@@ -442,6 +523,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
BinaryOp functor() const { return m_functor; }
private:
+ const Device& m_device;
const BinaryOp m_functor;
TensorEvaluator<LeftArgType, Device> m_leftImpl;
TensorEvaluator<RightArgType, Device> m_rightImpl;
@@ -458,6 +540,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
internal::functor_traits<TernaryOp>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<Arg1Type, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -562,6 +645,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
internal::packet_traits<Scalar>::HasBlend,
+ BlockAccess = false,
Layout = TensorEvaluator<IfArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
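
The next file adds the tiled executor itself: it asks the evaluator tree for resource requirements, sizes blocks to roughly fit the L1 cache, and then walks every block with one reused scratch buffer. A standalone sketch of the block-sizing arithmetic under the kSkewedInnerDims policy (made-up dimensions and budget; divup and the loop mirror BlockDimensions() and TensorBlockMapper above, but this is not Eigen's internal API):

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

// Ceil division, like Eigen's internal divup.
static long divup(long a, long b) { return (a + b - 1) / b; }

int main() {
  // Column-major tensor dimensions and a target block size in coefficients.
  const std::vector<long> dims = {256, 256};
  const long target_block_size = 8 * 1024;

  // Skewed-inner-dims policy: spend the budget on the innermost dimensions
  // first, then divide what is left among the outer ones.
  std::vector<long> block_dims(dims.size());
  long budget = target_block_size;
  for (std::size_t i = 0; i < dims.size(); ++i) {
    block_dims[i] = std::min(budget, dims[i]);
    budget = divup(budget, std::max(1L, block_dims[i]));
  }

  // Total number of blocks, as TensorBlockMapper computes it.
  long total_blocks = 1;
  for (std::size_t i = 0; i < dims.size(); ++i)
    total_blocks *= divup(dims[i], block_dims[i]);

  // The tiled executor then allocates one scratch buffer of
  // block_dims_total_size() coefficients and evaluates each block in turn.
  std::printf("block dims: %ld x %ld, blocks: %ld\n",
              block_dims[0], block_dims[1], total_blocks);
  return 0;
}
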
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 53640c6aa..ac5afd891 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -12,31 +12,40 @@
namespace Eigen {
-/** \class TensorExecutor
- * \ingroup CXX11_Tensor_Module
- *
- * \brief The tensor executor class.
- *
- * This class is responsible for launch the evaluation of the expression on
- * the specified computing device.
- */
+/**
+ * \class TensorExecutor
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor executor class.
+ *
+ * This class is responsible for launching the evaluation of the expression on
+ * the specified computing device.
+ *
+ * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
+ * instructions)
+ * @tparam Tileable can use block based tensor evaluation
+ * (see TensorBlock.h)
+ */
namespace internal {
-// Default strategy: the expression is evaluated with a single cpu thread.
-template<typename Expression, typename Device, bool Vectorizable>
-class TensorExecutor
-{
+/**
+ * Default strategy: the expression is evaluated sequentially with a single cpu
+ * thread, without vectorization or block evaluation.
+ */
+template <typename Expression, typename Device, bool Vectorizable,
+ bool Tileable>
+class TensorExecutor {
public:
- typedef typename Expression::Index Index;
+ using StorageIndex = typename Expression::Index;
+
EIGEN_DEVICE_FUNC
- static inline void run(const Expression& expr, const Device& device = Device())
- {
+ static inline void run(const Expression& expr,
+ const Device& device = Device()) {
TensorEvaluator<Expression, Device> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
- if (needs_assign)
- {
- const Index size = array_prod(evaluator.dimensions());
- for (Index i = 0; i < size; ++i) {
+ if (needs_assign) {
+ const StorageIndex size = array_prod(evaluator.dimensions());
+ for (StorageIndex i = 0; i < size; ++i) {
evaluator.evalScalar(i);
}
}
@@ -44,35 +53,40 @@ class TensorExecutor
}
};
-
-template<typename Expression>
-class TensorExecutor<Expression, DefaultDevice, true>
-{
+/**
+ * Process all the data with a single cpu thread, using vectorized instructions.
+ */
+template <typename Expression>
+class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
+ /*Tileable*/ false> {
public:
- typedef typename Expression::Index Index;
+ using StorageIndex = typename Expression::Index;
+
EIGEN_DEVICE_FUNC
- static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
- {
+ static inline void run(const Expression& expr,
+ const DefaultDevice& device = DefaultDevice()) {
TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
- if (needs_assign)
- {
- const Index size = array_prod(evaluator.dimensions());
- const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
- // Give the compiler a strong hint to unroll the loop. But don't insist
- // on unrolling, because if the function is expensive the compiler should not
+ if (needs_assign) {
+ const StorageIndex size = array_prod(evaluator.dimensions());
+ const int PacketSize = unpacket_traits<typename TensorEvaluator<
+ Expression, DefaultDevice>::PacketReturnType>::size;
+
+ // Give the compiler a strong hint to unroll the loop. But don't insist
+ // on unrolling, because if the function is expensive the compiler should not
// unroll the loop at the expense of inlining.
- const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
- for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
- for (Index j = 0; j < 4; j++) {
+ const StorageIndex UnrolledSize =
+ (size / (4 * PacketSize)) * 4 * PacketSize;
+ for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
+ for (StorageIndex j = 0; j < 4; j++) {
evaluator.evalPacket(i + j * PacketSize);
}
}
- const Index VectorizedSize = (size / PacketSize) * PacketSize;
- for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
+ const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
+ for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
evaluator.evalPacket(i);
}
- for (Index i = VectorizedSize; i < size; ++i) {
+ for (StorageIndex i = VectorizedSize; i < size; ++i) {
evaluator.evalScalar(i);
}
}
@@ -80,41 +94,107 @@ class TensorExecutor<Expression, DefaultDevice, true>
}
};
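The traversal above has three phases: packets unrolled four at a time, the remaining full packets, then a scalar tail. A standalone sketch of the same structure over a plain array (PacketSize is a stand-in constant; plain copies stand in for evalPacket/evalScalar):

void three_phase_copy(float* dst, const float* src, long size) {
  const long PacketSize = 4;  // stand-in for unpacket_traits<...>::size
  // Phase 1: unrolled by 4 packets per iteration.
  const long UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
  long i = 0;
  for (; i < UnrolledSize; i += 4 * PacketSize)
    for (long j = 0; j < 4 * PacketSize; ++j) dst[i + j] = src[i + j];
  // Phase 2: the remaining full packets.
  const long VectorizedSize = (size / PacketSize) * PacketSize;
  for (; i < VectorizedSize; i += PacketSize)
    for (long j = 0; j < PacketSize; ++j) dst[i + j] = src[i + j];
  // Phase 3: scalar tail.
  for (; i < size; ++i) dst[i] = src[i];
}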
+/**
+ * Process all the data with a single cpu thread, using blocks of data. By
+ * sizing each block to fit in the L1 cache we get better cache performance.
+ */
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, DefaultDevice, Vectorizable,
+ /*Tileable*/ true> {
+ public:
+ using Scalar = typename traits<Expression>::Scalar;
+ using ScalarNoConst = typename remove_const<Scalar>::type;
+
+ using Evaluator = TensorEvaluator<Expression, DefaultDevice>;
+ using StorageIndex = typename traits<Expression>::Index;
+
+ static const int NumDims = traits<Expression>::NumDimensions;
+
+ EIGEN_DEVICE_FUNC
+ static inline void run(const Expression& expr,
+ const DefaultDevice& device = DefaultDevice()) {
+ using TensorBlock =
+ TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
+ using TensorBlockMapper = TensorBlockMapper<ScalarNoConst, StorageIndex,
+ NumDims, Evaluator::Layout>;
+ Evaluator evaluator(expr, device);
+ Index total_size = array_prod(evaluator.dimensions());
+ Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+
+ if (total_size < cache_size) {
+ // TODO(andydavis) Reduce block management overhead for small tensors.
+ // TODO(wuke) Do not do this when evaluating TensorBroadcastingOp.
+ internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
+ /*Tileable*/ false>::run(expr, device);
+ return;
+ }
-// Multicore strategy: the index space is partitioned and each partition is executed on a single core
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign) {
+ // Size tensor blocks to fit in cache (or requested target block size).
+ Index block_total_size = numext::mini(cache_size, total_size);
+ TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
+ // Query expression tree for desired block size/shape.
+ std::vector<TensorOpResourceRequirements> resources;
+ evaluator.getResourceRequirements(&resources);
+ MergeResourceRequirements(resources, &block_shape, &block_total_size);
+
+ TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape,
+ block_total_size);
+ block_total_size = block_mapper.block_dims_total_size();
+
+ Scalar* data = static_cast<Scalar*>(
+ device.allocate(block_total_size * sizeof(Scalar)));
+
+ const StorageIndex total_block_count = block_mapper.total_block_count();
+ for (StorageIndex i = 0; i < total_block_count; ++i) {
+ TensorBlock block = block_mapper.GetBlockForIndex(i, data);
+ evaluator.evalBlock(&block);
+ }
+ device.deallocate(data);
+ }
+ evaluator.cleanup();
+ }
+};
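For a rough sense of the fallback threshold above (the cache size and scalar type are assumptions, not values queried from Eigen):

#include <cstddef>
// Assuming a 32 KB L1 data cache and float coefficients:
constexpr std::size_t kL1Bytes     = 32 * 1024;
constexpr std::size_t kCacheCoeffs = kL1Bytes / sizeof(float);  // 8192 coefficients
// A 64x64 float tensor (4096 coefficients) stays on the non-tiled path above,
// while a 128x128 tensor (16384 coefficients) is evaluated block by block.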
+
+/**
+ * Multicore strategy: the index space is partitioned and each partition is
+ * executed on a single core.
+ */
#ifdef EIGEN_USE_THREADS
-template <typename Evaluator, typename Index, bool Vectorizable>
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
- static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+ static void run(Evaluator* evaluator_in, const StorageIndex first,
+ const StorageIndex last) {
Evaluator evaluator = *evaluator_in;
eigen_assert(last >= first);
- for (Index i = first; i < last; ++i) {
+ for (StorageIndex i = first; i < last; ++i) {
evaluator.evalScalar(i);
}
}
- static Index alignBlockSize(Index size) {
- return size;
- }
+ static StorageIndex alignBlockSize(StorageIndex size) { return size; }
};
-template <typename Evaluator, typename Index>
-struct EvalRange<Evaluator, Index, true> {
- static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+template <typename Evaluator, typename StorageIndex>
+struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
+ static const int PacketSize =
+ unpacket_traits<typename Evaluator::PacketReturnType>::size;
- static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+ static void run(Evaluator* evaluator_in, const StorageIndex first,
+ const StorageIndex last) {
Evaluator evaluator = *evaluator_in;
eigen_assert(last >= first);
- Index i = first;
+ StorageIndex i = first;
if (last - first >= PacketSize) {
eigen_assert(first % PacketSize == 0);
- Index last_chunk_offset = last - 4 * PacketSize;
- // Give the compiler a strong hint to unroll the loop. But don't insist
- // on unrolling, because if the function is expensive the compiler should not
+ StorageIndex last_chunk_offset = last - 4 * PacketSize;
+ // Give the compiler a strong hint to unroll the loop. But don't insist
+ // on unrolling, because if the function is expensive the compiler should not
// unroll the loop at the expense of inlining.
- for (; i <= last_chunk_offset; i += 4*PacketSize) {
- for (Index j = 0; j < 4; j++) {
+ for (; i <= last_chunk_offset; i += 4 * PacketSize) {
+ for (StorageIndex j = 0; j < 4; j++) {
evaluator.evalPacket(i + j * PacketSize);
}
}
@@ -128,7 +208,7 @@ struct EvalRange<Evaluator, Index, true> {
}
}
- static Index alignBlockSize(Index size) {
+ static StorageIndex alignBlockSize(StorageIndex size) {
// Align block size to packet size and account for unrolling in run above.
if (size >= 16 * PacketSize) {
return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
@@ -138,106 +218,165 @@ struct EvalRange<Evaluator, Index, true> {
}
};
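The bit trick in alignBlockSize rounds a block size up to a multiple of 4 * PacketSize so that whole blocks map onto the unrolled loop. A worked example, assuming PacketSize == 8 (e.g. AVX floats):

// 4 * PacketSize == 32, and size == 200 satisfies size >= 16 * PacketSize:
//   (200 + 31) & ~31  ==  231 & ~31  ==  224
// i.e. 200 is rounded up to the next multiple of 32, so every parallel block
// except the last covers an exact number of unrolled packet iterations.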
-template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
+template <typename Expression, bool Vectorizable, bool Tileable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
public:
- typedef typename Expression::Index Index;
- static inline void run(const Expression& expr, const ThreadPoolDevice& device)
- {
+ using StorageIndex = typename Expression::Index;
+
+ static inline void run(const Expression& expr,
+ const ThreadPoolDevice& device) {
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+ typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
+
Evaluator evaluator(expr, device);
- const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
- if (needs_assign)
- {
- const Index size = array_prod(evaluator.dimensions());
- size_t num_threads = device.numThreads();
- if (num_threads > 1) {
- num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
- size, evaluator.costPerCoeff(Vectorizable), num_threads);
- }
- if (num_threads == 1) {
- EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
- } else {
- const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
- Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
- const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
- const Index numblocks = size / blocksize;
-
- Barrier barrier(numblocks);
- for (int i = 0; i < numblocks; ++i) {
- device.enqueue_with_barrier(
- &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
- &evaluator, i * blocksize, (i + 1) * blocksize);
- }
- if (numblocks * blocksize < size) {
- EvalRange<Evaluator, Index, Vectorizable>::run(
- &evaluator, numblocks * blocksize, size);
- }
- barrier.Wait();
- }
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+ if (needs_assign) {
+ const StorageIndex PacketSize =
+ Vectorizable
+ ? unpacket_traits<typename Evaluator::PacketReturnType>::size
+ : 1;
+ const StorageIndex size = array_prod(evaluator.dimensions());
+ device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
+ EvalRange::alignBlockSize,
+ [&evaluator](StorageIndex first, StorageIndex last) {
+ EvalRange::run(&evaluator, first, last);
+ });
+ }
+ evaluator.cleanup();
+ }
+};
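A minimal sketch of how this thread-pool path is reached from user code; the pool size of 4 and the expression are arbitrary choices:

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

void thread_pool_example() {
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);
  Eigen::Tensor<float, 2> a(1024, 1024), b(1024, 1024);
  a.setRandom();
  // .device(...) routes the assignment through the ThreadPoolDevice executor,
  // which splits the index space with device.parallelFor as shown above.
  b.device(device) = a.sqrt() + a;
}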
+
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> {
+ public:
+ using Scalar = typename traits<Expression>::Scalar;
+ using ScalarNoConst = typename remove_const<Scalar>::type;
+
+ using Evaluator = TensorEvaluator<Expression, ThreadPoolDevice>;
+ using StorageIndex = typename traits<Expression>::Index;
+
+ static const int NumDims = traits<Expression>::NumDimensions;
+
+ static inline void run(const Expression& expr,
+ const ThreadPoolDevice& device) {
+ using TensorBlock =
+ TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
+ using TensorBlockMapper =
+ TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
+
+ Evaluator evaluator(expr, device);
+ StorageIndex total_size = array_prod(evaluator.dimensions());
+ StorageIndex cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+ if (total_size < cache_size) {
+ // TODO(andydavis) Reduce block management overhead for small tensors.
+ internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
+ false>::run(expr, device);
+ evaluator.cleanup();
+ return;
+ }
+
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+ if (needs_assign) {
+ TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
+ Index block_total_size = 0;
+ // Query expression tree for desired block size/shape.
+ std::vector<internal::TensorOpResourceRequirements> resources;
+ evaluator.getResourceRequirements(&resources);
+ MergeResourceRequirements(resources, &block_shape, &block_total_size);
+ int num_threads = device.numThreads();
+
+ // Estimate minimum block size based on cost.
+ TensorOpCost cost = evaluator.costPerCoeff(Vectorizable);
+ double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, cost);
+ size_t block_size = static_cast<size_t>(1.0 / taskSize);
+ TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape,
+ block_size);
+ block_size = block_mapper.block_dims_total_size();
+ const size_t aligned_blocksize =
+ EIGEN_MAX_ALIGN_BYTES *
+ divup<size_t>(block_size * sizeof(Scalar), EIGEN_MAX_ALIGN_BYTES);
+ void* buf = device.allocate((num_threads + 1) * aligned_blocksize);
+ device.parallelFor(
+ block_mapper.total_block_count(), cost * block_size,
+ [=, &device, &evaluator, &block_mapper](StorageIndex first,
+ StorageIndex last) {
+ // currentThreadId() returns -1 if called from a thread not in the
+ // thread pool, such as the main thread dispatching Eigen
+ // expressions.
+ const int thread_idx = device.currentThreadId();
+ eigen_assert(thread_idx >= -1 && thread_idx < num_threads);
+ Scalar* thread_buf = reinterpret_cast<Scalar*>(
+ static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1));
+ for (StorageIndex i = first; i < last; ++i) {
+ auto block = block_mapper.GetBlockForIndex(i, thread_buf);
+ evaluator.evalBlock(&block);
+ }
+ });
+ device.deallocate(buf);
}
evaluator.cleanup();
}
};
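Worked numbers for the per-thread block buffers above, under assumed values (EIGEN_MAX_ALIGN_BYTES == 64, float scalars, block_size == 1000, num_threads == 4):

//   block_size * sizeof(Scalar)          = 4000 bytes
//   divup<size_t>(4000, 64)              = 63
//   aligned_blocksize                    = 63 * 64 = 4032 bytes
//   buf size = (num_threads + 1) * 4032  = 20160 bytes
// The extra buffer serves thread_idx == -1, i.e. work that runs on a caller
// thread that is not part of the pool.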
+
#endif // EIGEN_USE_THREADS
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)
-template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, GpuDevice, Vectorizable> {
+template <typename Expression, bool Vectorizable, bool Tileable>
+class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
public:
- typedef typename Expression::Index Index;
+ typedef typename Expression::Index StorageIndex;
static void run(const Expression& expr, const GpuDevice& device);
};
#if defined(EIGEN_GPUCC)
-template <typename Evaluator, typename Index, bool Vectorizable>
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
static __device__ EIGEN_ALWAYS_INLINE
- void run(Evaluator& eval, Index first, Index last, Index step_size) {
- for (Index i = first; i < last; i += step_size) {
+ void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) {
+ for (StorageIndex i = first; i < last; i += step_size) {
eval.evalScalar(i);
}
}
};
-template <typename Evaluator, typename Index>
-struct EigenMetaKernelEval<Evaluator, Index, true> {
+template <typename Evaluator, typename StorageIndex>
+struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
static __device__ EIGEN_ALWAYS_INLINE
- void run(Evaluator& eval, Index first, Index last, Index step_size) {
- const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
- const Index vectorized_size = (last / PacketSize) * PacketSize;
- const Index vectorized_step_size = step_size * PacketSize;
+ void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) {
+ const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+ const StorageIndex vectorized_size = (last / PacketSize) * PacketSize;
+ const StorageIndex vectorized_step_size = step_size * PacketSize;
// Use the vector path
- for (Index i = first * PacketSize; i < vectorized_size;
+ for (StorageIndex i = first * PacketSize; i < vectorized_size;
i += vectorized_step_size) {
eval.evalPacket(i);
}
- for (Index i = vectorized_size + first; i < last; i += step_size) {
+ for (StorageIndex i = vectorized_size + first; i < last; i += step_size) {
eval.evalScalar(i);
}
}
};
-template <typename Evaluator, typename Index>
+template <typename Evaluator, typename StorageIndex>
__global__ void
__launch_bounds__(1024)
-EigenMetaKernel(Evaluator eval, Index size) {
+EigenMetaKernel(Evaluator eval, StorageIndex size) {
- const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
- const Index step_size = blockDim.x * gridDim.x;
+ const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
+ const StorageIndex step_size = blockDim.x * gridDim.x;
const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
- EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
+ EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
}
/*static*/
-template <typename Expression, bool Vectorizable>
-inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
+template <typename Expression, bool Vectorizable, bool Tileable>
+inline void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
const Expression& expr, const GpuDevice& device) {
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
@@ -246,12 +385,12 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
const int block_size = device.maxGpuThreadsPerBlock();
const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor() / block_size;
- const Index size = array_prod(evaluator.dimensions());
+ const StorageIndex size = array_prod(evaluator.dimensions());
// Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
LAUNCH_GPU_KERNEL(
- (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
+ (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
num_blocks, block_size, 0, device, evaluator, size);
}
evaluator.cleanup();
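For a feel of the launch configuration above, a worked example with assumed device limits (not values from any particular GPU):

//   block_size = maxGpuThreadsPerBlock()              = 1024
//   max_blocks = 20 SMs * 2048 threads/SM / 1024      = 40
//   size = 1,000,000 coefficients, divup(size, 1024)  = 977
//   num_blocks = max(min(40, 977), 1)                 = 40
// and for size == 0 the outer clamp to 1 keeps the kernel launch valid.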
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index e943757ad..1342e47a6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -40,6 +40,8 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
enum {
IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
+ PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+ BlockAccess = false,
Layout = Options_ & RowMajor ? RowMajor : ColMajor,
CoordAccess = true,
RawAccess = true
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index b8f0bc798..fdb31928f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -98,6 +98,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
enum {
IsAligned = true,
PacketAccess = (PacketSize > 1),
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = true
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index 19e456e19..8ed1796df 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -129,8 +129,14 @@ struct IsVectorizable<GpuDevice, Expression> {
TensorEvaluator<Expression, GpuDevice>::IsAligned;
};
+template <typename Device, typename Expression>
+struct IsTileable {
+ static const bool value = TensorEvaluator<Expression, Device>::BlockAccess;
+};
+
template <typename Expression, typename Device,
- bool Vectorizable = IsVectorizable<Device, Expression>::value>
+ bool Vectorizable = IsVectorizable<Device, Expression>::value,
+ bool Tileable = IsTileable<Device, Expression>::value>
class TensorExecutor;
} // end namespace internal
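A sketch of how the new defaults resolve; the Assign alias below is illustrative only, not code from this patch:

// using Assign = TensorAssignOp<LhsXpr, const RhsXpr>;
// internal::TensorExecutor<const Assign, DefaultDevice>
//   resolves to
// internal::TensorExecutor<const Assign, DefaultDevice,
//     internal::IsVectorizable<DefaultDevice, const Assign>::value,
//     internal::IsTileable<DefaultDevice, const Assign>::value>
// so an expression whose evaluator reports BlockAccess == true is routed to
// the tiled specialization without any change at the call sites.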
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index f0f7c7826..72cb2d15f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -186,6 +186,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index 4e384f9b9..e3165fa10 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -119,6 +119,7 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false, // to be implemented
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
@@ -181,6 +182,7 @@ template<typename ArgType, typename Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false // to be implemented
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index cda49f8fe..498488649 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -105,6 +105,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
@@ -170,6 +171,7 @@ template<typename NewDimensions, typename ArgType, typename Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
@@ -325,6 +327,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
// slice offsets and sizes.
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
@@ -557,6 +560,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess
@@ -716,7 +720,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
static const int NumDims = internal::array_size<Strides>::value;
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
- typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef Strides Dimensions;
@@ -858,7 +861,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
}
return inputIndex;
}
-
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
#ifndef __SYCL_DEVICE_ONLY__
return numext::maxi(min, numext::mini(max,value));
@@ -907,7 +910,6 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
- typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef Strides Dimensions;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 5956e513d..ffa22f31e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -96,6 +96,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
enum {
IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = true,
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index 9e0a20abf..950ac32af 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -94,6 +94,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index ce573d730..375fc0802 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -412,6 +412,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
enum {
IsAligned = false,
PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
index b2b4fd8d3..a6cade50f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
@@ -136,6 +136,7 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
enum {
IsAligned = false,
PacketAccess = false,
+ BlockAccess = false,
Layout = PlainObjectType::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -364,6 +365,7 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
enum {
IsAligned = false,
PacketAccess = false,
+ BlockAccess = false,
Layout = TensorRef<Derived>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -411,6 +413,7 @@ struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<cons
enum {
IsAligned = false,
PacketAccess = false,
+ BlockAccess = false,
RawAccess = false
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 14a50a029..bb2768ab1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -113,6 +113,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -253,6 +254,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index 0697fd1ce..6b54f40ad 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -112,6 +112,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -240,6 +241,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+ BlockAccess = false,
RawAccess = false
};
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index a7eea99b6..c09513c10 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -112,6 +112,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
@@ -273,6 +274,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
index 2b1968de1..c8b2fad1e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
@@ -95,6 +95,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
@@ -110,7 +111,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
for (int i = 0; i < NumInputDims; ++i) {
m_reduced[i] = false;
}
-
+
const Dims& op_dims = op.dims();
for (int i = 0; i < NumReducedDims; ++i) {
eigen_assert(op_dims[i] >= 0);
@@ -128,7 +129,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
eigen_assert(num_distinct_reduce_dims == NumReducedDims);
- // Compute the dimensions of the result.
+ // Compute the dimensions of the result.
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
int output_index = 0;
@@ -229,7 +230,7 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
result += m_impl.coeff(cur_index);
cur_index += index_stride;
}
-
+
return result;
}