author    Eugene Zhulenev <ezhulenev@google.com>  2019-11-12 10:12:28 -0800
committer Eugene Zhulenev <ezhulenev@google.com>  2019-11-12 10:12:28 -0800
commit    13c3327f5cf829fd9d04a2ab46861e722cd74ca0 (patch)
tree      20bd1a5f361023db822298696efbcff7378ab4a7
parent    71aa53dd6dfdc497324d9e87f59c4ba820191856 (diff)
Remove legacy block evaluation support
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h              |    2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h          |    1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h              |   15
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h               |  907
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h        |  274
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h            |   68
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h       |    2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h         |    1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h          |    1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h         |    2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h     |    1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h            |    2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h              |   16
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h           |   72
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h            |  202
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h                 |    1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h           |    1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h          |   11
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h |    9
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h           |   55
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h          |  134
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h           |    1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h          |    2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h            |  178
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h             |    1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h               |    1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h           |  258
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorRef.h                 |    3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h             |  108
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorScan.h                |    1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h           |  114
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h            |    2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h               |    1
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h         |    1
-rw-r--r--  unsupported/test/cxx11_tensor_block_access.cpp                 |  650
-rw-r--r--  unsupported/test/cxx11_tensor_executor.cpp                     |  212
36 files changed, 74 insertions, 3236 deletions
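The hunks below remove the legacy block-evaluation path: the BlockAccess evaluator flag, the TensorBlock-pointer based block()/evalBlock() methods, and the supporting helpers in TensorBlock.h (TensorBlockCopyOp, TensorBlockIO, TensorBlockReader/Writer, the cwise block ops, TensorBlockView, TensorSliceBlockMapper). Only the V2 interface (BlockAccessV2, blockV2(), evalBlockV2()) remains. As a rough editorial sketch of the two interface shapes: the type and method names mirror the hunks below, while the struct members and bodies are placeholders, not the real Eigen definitions.

// Illustrative only; not part of this commit.
struct TensorBlock {};          // legacy: caller-provided destination block
struct TensorBlockDesc {};      // V2: describes the block's offset and sizes
struct TensorBlockScratch {};   // V2: scratch allocator for temporaries
struct TensorBlockV2 {};        // V2: materialized block handed back to the caller

struct LegacyEvaluatorShape {                       // path removed here
  enum { BlockAccess = true };
  void block(TensorBlock* output_block) const {}    // fill the caller's block
  void evalBlock(TensorBlock* block) {}             // assignment entry point
};

struct EvaluatorShapeV2 {                           // path that remains
  enum { BlockAccessV2 = true };
  TensorBlockV2 blockV2(TensorBlockDesc&, TensorBlockScratch&,
                        bool root_of_expr_ast = false) const {
    return TensorBlockV2();
  }
  void evalBlockV2(TensorBlockDesc&, TensorBlockScratch&) {}
};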
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index f2a5d86fe..68bfd141a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -88,7 +88,6 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -230,7 +229,6 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h
index e6d8e7f91..2184c94b3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h
@@ -108,7 +108,6 @@ struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, Sy
enum {
IsAligned = false,
PacketAccess = false,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, SyclKernelDevice>::Layout,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index 1f64de3a9..d7795a00d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -108,8 +108,6 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
TensorEvaluator<RightArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
TensorEvaluator<RightArgType, Device>::PacketAccess,
- BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
- TensorEvaluator<RightArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<LeftArgType, Device>::BlockAccessV2 &
TensorEvaluator<RightArgType, Device>::BlockAccessV2,
PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess |
@@ -216,19 +214,6 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
m_rightImpl.getResourceRequirements(resources);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) {
- if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
- m_leftImpl.data() != NULL) {
- TensorBlock left_block(block->first_coeff_index(), block->block_sizes(),
- block->tensor_strides(), block->tensor_strides(),
- m_leftImpl.data() + block->first_coeff_index());
- m_rightImpl.block(&left_block);
- } else {
- m_rightImpl.block(block);
- m_leftImpl.writeBlock(*block);
- }
- }
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2(
TensorBlockDesc& desc, TensorBlockScratch& scratch) {
if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
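Both the removed evalBlock() above and the evalBlockV2() that replaces it start from the same destination-aliasing decision: if the left-hand side exposes raw, addressable memory, the right-hand-side block is evaluated straight into it; otherwise the block is materialized first and then written back. A minimal standalone sketch of that decision, with stand-in types and a fake right-hand side; nothing here is Eigen's actual implementation.

#include <cstdio>
#include <vector>

// Fake "evaluate the RHS block" step; stands in for m_rightImpl.block(...).
static void eval_rhs_block_into(float* out, long n) {
  for (long i = 0; i < n; ++i) out[i] = static_cast<float>(i);
}

// Fallback write-back; stands in for m_leftImpl.writeBlock(...).
static void write_back(std::vector<float>& dst, long offset,
                       const float* block, long n) {
  for (long i = 0; i < n; ++i) dst[offset + i] = block[i];
}

static void assign_block(std::vector<float>& dst, float* raw, long offset, long n) {
  if (raw != nullptr) {
    eval_rhs_block_into(raw + offset, n);   // fast path: no temporary buffer
  } else {
    std::vector<float> tmp(n);              // generic path: materialize ...
    eval_rhs_block_into(tmp.data(), n);
    write_back(dst, offset, tmp.data(), n); // ... then copy into the LHS
  }
}

int main() {
  std::vector<float> dst(8, 0.f);
  assign_block(dst, /*raw=*/dst.data(), /*offset=*/4, /*n=*/4);  // aliased path
  assign_block(dst, /*raw=*/nullptr,    /*offset=*/0, /*n=*/4);  // generic path
  for (float v : dst) std::printf("%g ", v);  // prints: 0 1 2 3 0 1 2 3
  std::printf("\n");
  return 0;
}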
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
index a8e7a8d7b..447da9121 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -142,782 +142,6 @@ class TensorBlock {
Scalar* m_data; // Not owned.
};
-template <typename Scalar, typename StorageIndex>
-struct TensorBlockCopyOp {
-
- typedef typename packet_traits<Scalar>::type Packet;
- enum {
- Vectorizable = packet_traits<Scalar>::Vectorizable,
- PacketSize = packet_traits<Scalar>::size
- };
-
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
- const StorageIndex num_coeff_to_copy, const StorageIndex dst_index,
- const StorageIndex dst_stride, Scalar* EIGEN_RESTRICT dst_data,
- const StorageIndex src_index, const StorageIndex src_stride,
- const Scalar* EIGEN_RESTRICT src_data) {
- const Scalar* src = &src_data[src_index];
- Scalar* dst = &dst_data[dst_index];
-
- if (!Vectorizable) {
- for (Index i = 0; i < num_coeff_to_copy; ++i) {
- dst[i * dst_stride] = src[i * src_stride];
- }
- return;
- }
-
- if (src_stride == 1) {
- const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
- if (dst_stride == 1) {
- // LINEAR
- for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
- Packet p = ploadu<Packet>(src + i);
- pstoreu<Scalar, Packet>(dst + i, p);
- }
- for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
- dst[i] = src[i];
- }
- } else {
- // SCATTER
- for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
- Packet p = ploadu<Packet>(src + i);
- pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
- }
- for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
- dst[i * dst_stride] = src[i];
- }
- }
- } else if (src_stride == 0) {
- const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
- if (dst_stride == 1) {
- // LINEAR
- for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
- Packet p = pload1<Packet>(src);
- pstoreu<Scalar, Packet>(dst + i, p);
- }
- for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
- dst[i] = *src;
- }
- } else {
- // SCATTER
- for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
- Packet p = pload1<Packet>(src);
- pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
- }
- for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
- dst[i * dst_stride] = *src;
- }
- }
- } else {
- if (dst_stride == 1) {
- // GATHER
- const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
- for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
- Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
- pstoreu<Scalar, Packet>(dst + i, p);
- }
- for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
- dst[i] = src[i * src_stride];
- }
- } else {
- // RANDOM
- for (StorageIndex i = 0; i < num_coeff_to_copy; ++i) {
- dst[i * dst_stride] = src[i * src_stride];
- }
- }
- }
- }
-};
-
-/**
- * \class TensorBlockIO
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor block IO class.
- *
- * This class is responsible for copying data between a tensor and a tensor
- * block.
- */
-template <typename Scalar, typename StorageIndex, int NumDims, int Layout,
- bool BlockRead>
-class TensorBlockIO {
- public:
- typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
- typedef TensorBlockCopyOp<Scalar, StorageIndex> BlockCopyOp;
-
- protected:
- typedef array<StorageIndex, NumDims> Dimensions;
-
- struct BlockIteratorState {
- StorageIndex input_stride;
- StorageIndex output_stride;
- StorageIndex input_span;
- StorageIndex output_span;
- StorageIndex size;
- StorageIndex count;
- BlockIteratorState()
- : input_stride(0),
- output_stride(0),
- input_span(0),
- output_span(0),
- size(0),
- count(0) {}
- };
-
- // Compute how many inner dimensions it's allowed to squeeze when doing IO
- // between a tensor and a block. It's safe to squeeze inner dimensions, only
- // if they are not reordered.
- static int NumSqueezableInnerDims(const Dimensions& tensor_to_block_dim_map) {
- int num_squeezable_dims = 0;
- if (Layout == ColMajor) {
- for (int i = 0; i < NumDims; ++i) {
- if (tensor_to_block_dim_map[i] == i) num_squeezable_dims++;
- else break;
- }
- } else {
- for (int i = NumDims - 1; i >= 0; --i) {
- if (tensor_to_block_dim_map[i] == i) num_squeezable_dims++;
- else break;
- }
- }
- return num_squeezable_dims;
- }
-
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(
- const Block& block, StorageIndex first_coeff_index,
- const Dimensions& tensor_to_block_dim_map,
- const Dimensions& tensor_strides,
- const Scalar* src_data,
- Scalar* dst_data) {
- // Do not squeeze reordered inner dimensions.
- int num_squeezable_dims = NumSqueezableInnerDims(tensor_to_block_dim_map);
-
- // Find the innermost tensor dimension whose size is not 1. This is the
- // effective inner dim. If all dimensions are of size 1, then fallback to
- // using the actual innermost dim to avoid out-of-bound access.
- StorageIndex num_size_one_inner_dims = 0;
- for (int i = 0; i < num_squeezable_dims; ++i) {
- const int dim = cond<Layout>()(i, NumDims - i - 1);
- if (block.block_sizes()[tensor_to_block_dim_map[dim]] != 1) {
- num_size_one_inner_dims = i;
- break;
- }
- }
-
- // Calculate strides and dimensions.
- const StorageIndex tensor_stride1_dim = cond<Layout>()(
- num_size_one_inner_dims, NumDims - num_size_one_inner_dims - 1);
- const StorageIndex block_dim_for_tensor_stride1_dim =
- NumDims == 0 ? 1 : tensor_to_block_dim_map[tensor_stride1_dim];
- StorageIndex block_inner_dim_size =
- NumDims == 0 ? 1
- : block.block_sizes()[block_dim_for_tensor_stride1_dim];
-
- // Squeeze multiple inner dims into one for larger inner dim size.
- for (Index i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
- const Index dim = cond<Layout>()(i, NumDims - i - 1);
- const StorageIndex block_stride =
- block.block_strides()[tensor_to_block_dim_map[dim]];
- if (block_inner_dim_size == block_stride &&
- block_stride == tensor_strides[dim]) {
- block_inner_dim_size *=
- block.block_sizes()[tensor_to_block_dim_map[dim]];
- ++num_size_one_inner_dims;
- } else {
- break;
- }
- }
-
- StorageIndex inputIndex;
- StorageIndex outputIndex;
- StorageIndex input_stride;
- StorageIndex output_stride;
-
- // Setup strides to read/write along the tensor's stride1 dimension.
- if (BlockRead) {
- inputIndex = first_coeff_index;
- outputIndex = 0;
- input_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim];
- output_stride =
- NumDims == 0
- ? 1
- : block.block_strides()[block_dim_for_tensor_stride1_dim];
- } else {
- inputIndex = 0;
- outputIndex = first_coeff_index;
- input_stride =
- NumDims == 0
- ? 1
- : block.block_strides()[block_dim_for_tensor_stride1_dim];
- output_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim];
- }
-
- const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
- array<BlockIteratorState, at_least_1_dim> block_iter_state;
-
- // Initialize block iterator state. Squeeze away any dimension of size 1.
- Index num_squeezed_dims = 0;
- for (Index i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
- const Index dim = cond<Layout>()(i + 1, NumDims - i - 2);
- const StorageIndex size = block.block_sizes()[tensor_to_block_dim_map[dim]];
- if (size == 1) {
- continue;
- }
- block_iter_state[num_squeezed_dims].size = size;
- if (BlockRead) {
- block_iter_state[num_squeezed_dims].input_stride = tensor_strides[dim];
- block_iter_state[num_squeezed_dims].output_stride =
- block.block_strides()[tensor_to_block_dim_map[dim]];
- } else {
- block_iter_state[num_squeezed_dims].input_stride =
- block.block_strides()[tensor_to_block_dim_map[dim]];
- block_iter_state[num_squeezed_dims].output_stride = tensor_strides[dim];
- }
- block_iter_state[num_squeezed_dims].input_span =
- block_iter_state[num_squeezed_dims].input_stride *
- (block_iter_state[num_squeezed_dims].size - 1);
- block_iter_state[num_squeezed_dims].output_span =
- block_iter_state[num_squeezed_dims].output_stride *
- (block_iter_state[num_squeezed_dims].size - 1);
- ++num_squeezed_dims;
- }
-
- // Iterate copying data from src to dst.
- const StorageIndex block_total_size =
- NumDims == 0 ? 1 : block.block_sizes().TotalSize();
- for (StorageIndex i = 0; i < block_total_size; i += block_inner_dim_size) {
- BlockCopyOp::Run(block_inner_dim_size, outputIndex, output_stride,
- dst_data, inputIndex, input_stride, src_data);
- // Update index.
- for (int j = 0; j < num_squeezed_dims; ++j) {
- if (++block_iter_state[j].count < block_iter_state[j].size) {
- inputIndex += block_iter_state[j].input_stride;
- outputIndex += block_iter_state[j].output_stride;
- break;
- }
- block_iter_state[j].count = 0;
- inputIndex -= block_iter_state[j].input_span;
- outputIndex -= block_iter_state[j].output_span;
- }
- }
- }
-};
-
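The removed TensorBlockIO::Copy() above spends most of its setup merging ("squeezing") contiguous inner dimensions so that each call to the copy kernel moves one longer contiguous span instead of many short ones. A minimal standalone illustration of the merge condition, simplified to col-major layout with unreordered dimensions and made-up sizes and strides:

#include <array>
#include <cstdio>

int main() {
  const int kDims = 3;
  std::array<long, kDims> block_sizes    = {4, 8, 3};   // block extent per dim
  std::array<long, kDims> block_strides  = {1, 4, 32};  // strides inside the block
  std::array<long, kDims> tensor_strides = {1, 4, 64};  // strides in the full tensor

  long inner_copy_size = block_sizes[0];
  int merged_dims = 1;
  for (int d = 1; d < kDims; ++d) {
    // Dimension d can be folded into the copy span only if it is contiguous
    // with the previous dims in BOTH the block and the tensor addressing.
    if (block_strides[d] == inner_copy_size &&
        tensor_strides[d] == inner_copy_size) {
      inner_copy_size *= block_sizes[d];
      ++merged_dims;
    } else {
      break;
    }
  }
  // Dims 0 and 1 merge (4 * 8 = 32 contiguous coefficients per copy call);
  // dim 2 does not, because the tensor stride jumps to 64 there.
  std::printf("merged %d dims, copy span = %ld\n", merged_dims, inner_copy_size);
  return 0;
}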
-/**
- * \class TensorBlockReader
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor block reader class.
- *
- * This class is responsible for reading a tensor block.
- *
- */
-template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
-class TensorBlockReader : public TensorBlockIO<Scalar, StorageIndex, NumDims,
- Layout, /*BlockRead=*/true> {
- public:
- typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
- typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/true> Base;
-
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
- Block* block, const Scalar* src_data) {
- array<StorageIndex, NumDims> tensor_to_block_dim_map;
- for (int i = 0; i < NumDims; ++i) {
- tensor_to_block_dim_map[i] = i;
- }
- Base::Copy(*block, block->first_coeff_index(), tensor_to_block_dim_map,
- block->tensor_strides(), src_data, block->data());
- }
-
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
- Block* block, StorageIndex first_coeff_index,
- const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
- const array<StorageIndex, NumDims>& tensor_strides, const Scalar* src_data) {
- Base::Copy(*block, first_coeff_index, tensor_to_block_dim_map,
- tensor_strides, src_data, block->data());
- }
-};
-
-/**
- * \class TensorBlockWriter
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor block writer class.
- *
- * This class is responsible for writing a tensor block.
- *
- */
-template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
-class TensorBlockWriter : public TensorBlockIO<Scalar, StorageIndex, NumDims,
- Layout, /*BlockRead=*/false> {
- public:
- typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
- typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/false> Base;
-
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
- const Block& block, Scalar* dst_data) {
- array<StorageIndex, NumDims> tensor_to_block_dim_map;
- for (int i = 0; i < NumDims; ++i) {
- tensor_to_block_dim_map[i] = i;
- }
- Base::Copy(block, block.first_coeff_index(), tensor_to_block_dim_map,
- block.tensor_strides(), block.data(), dst_data);
- }
-
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
- const Block& block, StorageIndex first_coeff_index,
- const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
- const array<StorageIndex, NumDims>& tensor_strides, Scalar* dst_data) {
- Base::Copy(block, first_coeff_index, tensor_to_block_dim_map,
- tensor_strides, block.data(), dst_data);
- }
-};
-
-/**
- * \class TensorBlockCwiseUnaryOp
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Carries out a cwise binary op on a number of coefficients.
- *
- * This class reads strided input from the argument, and writes the
- * result of the cwise unary op to the strided output array.
- *
- */
-template <bool Vectorizable>
-struct TensorBlockCwiseUnaryOp {
- template <typename StorageIndex, typename UnaryFunctor,
- typename OutputScalar, typename InputScalar>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
- const UnaryFunctor& functor, const StorageIndex num_coeff,
- const StorageIndex output_index, const StorageIndex output_stride,
- OutputScalar* output_data, const StorageIndex input_index,
- const StorageIndex input_stride, const InputScalar* input_data) {
- typedef const Array<InputScalar, Dynamic, 1> Input;
- typedef Array<OutputScalar, Dynamic, 1> Output;
-
- typedef Map<Input, 0, InnerStride<> > InputMap;
- typedef Map<Output, 0, InnerStride<> > OutputMap;
-
- const InputScalar* input_base = &input_data[input_index];
- OutputScalar* output_base = &output_data[output_index];
-
- const InputMap input(input_base, num_coeff, InnerStride<>(input_stride));
- OutputMap output(output_base, num_coeff, InnerStride<>(output_stride));
-
- output = CwiseUnaryOp<UnaryFunctor, InputMap>(input, functor);
- }
-};
-
-template<>
-struct TensorBlockCwiseUnaryOp<true> {
- template <typename StorageIndex, typename UnaryFunctor,
- typename OutputScalar, typename InputScalar>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
- const UnaryFunctor& functor, const StorageIndex num_coeff,
- const StorageIndex output_index, const StorageIndex output_stride,
- OutputScalar* output_data, const StorageIndex input_index,
- const StorageIndex input_stride, const InputScalar* input_data) {
- if (input_stride == 1 && output_stride == 1) {
- typedef const Array<InputScalar, Dynamic, 1> Input;
- typedef Array<OutputScalar, Dynamic, 1> Output;
-
- const Map<Input> input(&input_data[input_index], num_coeff);
- Map<Output> output(&output_data[output_index], num_coeff);
-
- output = CwiseUnaryOp<UnaryFunctor, Map<Input> >(input, functor);
- } else {
- TensorBlockCwiseUnaryOp<false>::Run(
- functor, num_coeff, output_index, output_stride, output_data,
- input_index, input_stride, input_data);
- }
- }
-};
-
-/**
- * \class TensorBlockCwiseUnaryIO
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor block IO class for carrying out cwise unary ops.
- *
- * This class carries out the unary op on given blocks.
- */
-template <typename UnaryFunctor, typename StorageIndex, typename OutputScalar,
- int NumDims, int Layout>
-struct TensorBlockCwiseUnaryIO {
- typedef typename TensorBlock<OutputScalar, StorageIndex, NumDims,
- Layout>::Dimensions Dimensions;
-
- typedef TensorBlockCwiseUnaryOp<
- packet_traits<OutputScalar>::Vectorizable &&
- functor_traits<UnaryFunctor>::PacketAccess>
- TensorBlockCwiseUnaryOpImpl;
-
- struct BlockIteratorState {
- StorageIndex output_stride, output_span;
- StorageIndex input_stride, input_span;
- StorageIndex size, count;
- };
-
- template <typename InputScalar>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
- const UnaryFunctor& functor, const Dimensions& block_sizes,
- const Dimensions& block_strides, OutputScalar* output_data,
- const array<StorageIndex, NumDims>& input_strides,
- const InputScalar* input_data) {
- // Find the innermost dimension whose size is not 1. This is the effective
- // inner dim. If all dimensions are of size 1, fallback to using the actual
- // innermost dim to avoid out-of-bound access.
- int num_size_one_inner_dims = 0;
- for (int i = 0; i < NumDims; ++i) {
- const int dim = cond<Layout>()(i, NumDims - i - 1);
- if (block_sizes[dim] != 1) {
- num_size_one_inner_dims = i;
- break;
- }
- }
- // Calculate strides and dimensions.
- const int inner_dim =
- NumDims == 0 ? 1
- : cond<Layout>()(num_size_one_inner_dims,
- NumDims - num_size_one_inner_dims - 1);
- StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim];
- for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
- const int dim = cond<Layout>()(i, NumDims - i - 1);
- // Merge multiple inner dims into one for larger inner dim size (i.e.
- // fewer calls to TensorBlockCwiseUnaryOp::Run()).
- if (inner_dim_size == block_strides[dim] &&
- block_strides[dim] == input_strides[dim]) {
- inner_dim_size *= block_sizes[dim];
- ++num_size_one_inner_dims;
- } else {
- break;
- }
- }
-
- StorageIndex output_index = 0, input_index = 0;
-
- const StorageIndex output_stride =
- NumDims == 0 ? 1 : block_strides[inner_dim];
- const StorageIndex input_stride =
- NumDims == 0 ? 1 : input_strides[inner_dim];
-
- const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
- array<BlockIteratorState, at_least_1_dim> block_iter_state;
-
- // Initialize block iterator state. Squeeze away any dimension of size 1.
- int num_squeezed_dims = 0;
- for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
- const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
- const StorageIndex size = block_sizes[dim];
- if (size == 1) {
- continue;
- }
- BlockIteratorState& state = block_iter_state[num_squeezed_dims];
- state.output_stride = block_strides[dim];
- state.input_stride = input_strides[dim];
- state.size = size;
- state.output_span = state.output_stride * (size - 1);
- state.input_span = state.input_stride * (size - 1);
- state.count = 0;
- ++num_squeezed_dims;
- }
-
- // Compute cwise unary op.
- const StorageIndex block_total_size =
- NumDims == 0 ? 1 : block_sizes.TotalSize();
- for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
- TensorBlockCwiseUnaryOpImpl::Run(functor, inner_dim_size, output_index,
- output_stride, output_data, input_index,
- input_stride, input_data);
- // Update index.
- for (int j = 0; j < num_squeezed_dims; ++j) {
- BlockIteratorState& state = block_iter_state[j];
- if (++state.count < state.size) {
- output_index += state.output_stride;
- input_index += state.input_stride;
- break;
- }
- state.count = 0;
- output_index -= state.output_span;
- input_index -= state.input_span;
- }
- }
- }
-};
-
-/**
- * \class TensorBlockCwiseBinaryOp
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Carries out a cwise binary op on a number of coefficients.
- *
- * This class reads strided inputs from left and right operands, and writes the
- * result of the cwise binary op to the strided output array.
- *
- */
-template<bool Vectorizable>
-struct TensorBlockCwiseBinaryOp {
- template <typename StorageIndex, typename BinaryFunctor, typename OutputScalar,
- typename LeftScalar, typename RightScalar>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
- const BinaryFunctor& functor, const StorageIndex num_coeff,
- const StorageIndex output_index, const StorageIndex output_stride,
- OutputScalar* output_data, const StorageIndex left_index,
- const StorageIndex left_stride, const LeftScalar* left_data,
- const StorageIndex right_index, const StorageIndex right_stride,
- const RightScalar* right_data) {
- typedef const Array<LeftScalar, Dynamic, 1> Lhs;
- typedef const Array<RightScalar, Dynamic, 1> Rhs;
- typedef Array<OutputScalar, Dynamic, 1> Out;
-
- typedef Map<Lhs, 0, InnerStride<> > LhsMap;
- typedef Map<Rhs, 0, InnerStride<> > RhsMap;
- typedef Map<Out, 0, InnerStride<> > OutMap;
-
- const LeftScalar* lhs_base = &left_data[left_index];
- const RightScalar* rhs_base = &right_data[right_index];
- OutputScalar* out_base = &output_data[output_index];
-
- const LhsMap lhs(lhs_base, num_coeff, InnerStride<>(left_stride));
- const RhsMap rhs(rhs_base, num_coeff, InnerStride<>(right_stride));
- OutMap out(out_base, num_coeff, InnerStride<>(output_stride));
-
- out = CwiseBinaryOp<BinaryFunctor, LhsMap, RhsMap>(lhs, rhs, functor);
- }
-};
-
-template<>
-struct TensorBlockCwiseBinaryOp<true> {
- template <typename StorageIndex, typename BinaryFunctor, typename OutputScalar,
- typename LeftScalar, typename RightScalar>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
- const BinaryFunctor& functor, const StorageIndex num_coeff,
- const StorageIndex output_index, const StorageIndex output_stride,
- OutputScalar* output_data, const StorageIndex left_index,
- const StorageIndex left_stride, const LeftScalar* left_data,
- const StorageIndex right_index, const StorageIndex right_stride,
- const RightScalar* right_data) {
- if (left_stride == 1 && right_stride == 1 && output_stride == 1) {
- typedef const Array<LeftScalar, Dynamic, 1> Lhs;
- typedef const Array<RightScalar, Dynamic, 1> Rhs;
- typedef Array<OutputScalar, Dynamic, 1> Out;
-
- const LeftScalar* lhs_base = &left_data[left_index];
- const RightScalar* rhs_base = &right_data[right_index];
- OutputScalar* out_base = &output_data[output_index];
-
- const Map<Lhs> lhs(lhs_base, num_coeff);
- const Map<Rhs> rhs(rhs_base, num_coeff);
- Map<Out> out(out_base, num_coeff);
-
- out = CwiseBinaryOp<BinaryFunctor, Map<Lhs>, Map<Rhs> >(lhs, rhs, functor);
- } else {
- TensorBlockCwiseBinaryOp<false>::Run(
- functor, num_coeff, output_index, output_stride, output_data,
- left_index, left_stride, left_data, right_index, right_stride,
- right_data);
- }
- }
-};
-
-/**
- * \class TensorBlockCwiseBinaryIO
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor block IO class for carrying out cwise binary ops.
- *
- * This class carries out the binary op on given blocks.
- *
- */
-template <typename BinaryFunctor, typename StorageIndex, typename OutputScalar,
- int NumDims, int Layout>
-struct TensorBlockCwiseBinaryIO {
- typedef typename TensorBlock<OutputScalar, StorageIndex, NumDims, Layout>::Dimensions Dimensions;
-
- typedef TensorBlockCwiseBinaryOp<
- packet_traits<OutputScalar>::Vectorizable &&
- functor_traits<BinaryFunctor>::PacketAccess>
- TensorBlockCwiseBinaryOpImpl;
-
- struct BlockIteratorState {
- StorageIndex output_stride, output_span;
- StorageIndex left_stride, left_span;
- StorageIndex right_stride, right_span;
- StorageIndex size, count;
- };
-
- template <typename LeftScalar, typename RightScalar>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
- const BinaryFunctor& functor, const Dimensions& block_sizes,
- const Dimensions& block_strides, OutputScalar* output_data,
- const array<StorageIndex, NumDims>& left_strides,
- const LeftScalar* left_data,
- const array<StorageIndex, NumDims>& right_strides,
- const RightScalar* right_data) {
- // Find the innermost dimension whose size is not 1. This is the effective
- // inner dim. If all dimensions are of size 1, fallback to using the actual
- // innermost dim to avoid out-of-bound access.
- int num_size_one_inner_dims = 0;
- for (int i = 0; i < NumDims; ++i) {
- const int dim = cond<Layout>()(i, NumDims - i - 1);
- if (block_sizes[dim] != 1) {
- num_size_one_inner_dims = i;
- break;
- }
- }
- // Calculate strides and dimensions.
- const int inner_dim =
- NumDims == 0 ? 1
- : cond<Layout>()(num_size_one_inner_dims,
- NumDims - num_size_one_inner_dims - 1);
- StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim];
- for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
- const int dim = cond<Layout>()(i, NumDims - i - 1);
- // Merge multiple inner dims into one for larger inner dim size (i.e.
- // fewer calls to TensorBlockCwiseBinaryOp::Run()).
- if (inner_dim_size == block_strides[dim] &&
- block_strides[dim] == left_strides[dim] &&
- block_strides[dim] == right_strides[dim]) {
- inner_dim_size *= block_sizes[dim];
- ++num_size_one_inner_dims;
- } else {
- break;
- }
- }
-
- StorageIndex output_index = 0, left_index = 0, right_index = 0;
- const StorageIndex output_stride =
- NumDims == 0 ? 1 : block_strides[inner_dim];
- const StorageIndex left_stride = NumDims == 0 ? 1 : left_strides[inner_dim];
- const StorageIndex right_stride =
- NumDims == 0 ? 1 : right_strides[inner_dim];
-
- const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
- array<BlockIteratorState, at_least_1_dim> block_iter_state;
-
- // Initialize block iterator state. Squeeze away any dimension of size 1.
- int num_squeezed_dims = 0;
- for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
- const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
- const StorageIndex size = block_sizes[dim];
- if (size == 1) {
- continue;
- }
- BlockIteratorState& state = block_iter_state[num_squeezed_dims];
- state.output_stride = block_strides[dim];
- state.left_stride = left_strides[dim];
- state.right_stride = right_strides[dim];
- state.size = size;
- state.output_span = state.output_stride * (size - 1);
- state.left_span = state.left_stride * (size - 1);
- state.right_span = state.right_stride * (size - 1);
- state.count = 0;
- ++num_squeezed_dims;
- }
-
- // Compute cwise binary op.
- const StorageIndex block_total_size =
- NumDims == 0 ? 1 : block_sizes.TotalSize();
- for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
- TensorBlockCwiseBinaryOpImpl::Run(functor, inner_dim_size, output_index,
- output_stride, output_data, left_index,
- left_stride, left_data, right_index,
- right_stride, right_data);
- // Update index.
- for (int j = 0; j < num_squeezed_dims; ++j) {
- BlockIteratorState& state = block_iter_state[j];
- if (++state.count < state.size) {
- output_index += state.output_stride;
- left_index += state.left_stride;
- right_index += state.right_stride;
- break;
- }
- state.count = 0;
- output_index -= state.output_span;
- left_index -= state.left_span;
- right_index -= state.right_span;
- }
- }
- }
-};
-
-/**
- * \class TensorBlockView
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Read-only view into a block of data.
- *
- * This class provides read-only access to a block of data in impl. It may need
- * to allocate space for holding the intermediate result.
- *
- */
-template <class ArgType, class Device>
-struct TensorBlockView {
- typedef TensorEvaluator<ArgType, Device> Impl;
- typedef typename Impl::Index StorageIndex;
- typedef typename remove_const<typename Impl::Scalar>::type Scalar;
- static const int NumDims = array_size<typename Impl::Dimensions>::value;
- typedef DSizes<StorageIndex, NumDims> Dimensions;
-
- // Constructs a TensorBlockView for `impl`. `block` is only used for for
- // specifying the start offset, shape, and strides of the block.
- template <typename OtherTensorBlock>
- TensorBlockView(const Device& device,
- const TensorEvaluator<ArgType, Device>& impl,
- const OtherTensorBlock& block)
- : m_device(device),
- m_block_sizes(block.block_sizes()),
- m_data(NULL),
- m_allocated_data(NULL) {
- if (Impl::RawAccess && impl.data() != NULL) {
- m_data = impl.data() + block.first_coeff_index();
- m_block_strides = block.tensor_strides();
- } else {
- // Actually make a copy.
-
- // TODO(wuke): This sometimes put a lot pressure on the heap allocator.
- // Consider allowing ops to request additional temporary block memory in
- // TensorOpResourceRequirements.
- m_allocated_data = static_cast<Scalar*>(
- m_device.allocate(m_block_sizes.TotalSize() * sizeof(Scalar)));
- m_data = m_allocated_data;
- if (NumDims > 0) {
- if (static_cast<int>(Impl::Layout) == static_cast<int>(ColMajor)) {
- m_block_strides[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- m_block_strides[i] = m_block_strides[i - 1] * m_block_sizes[i - 1];
- }
- } else {
- m_block_strides[NumDims - 1] = 1;
- for (int i = NumDims - 2; i >= 0; --i) {
- m_block_strides[i] = m_block_strides[i + 1] * m_block_sizes[i + 1];
- }
- }
- }
- TensorBlock<Scalar, StorageIndex, NumDims, Impl::Layout> input_block(
- block.first_coeff_index(), m_block_sizes, m_block_strides,
- block.tensor_strides(), m_allocated_data);
- impl.block(&input_block);
- }
- }
-
- ~TensorBlockView() {
- if (m_allocated_data != NULL) {
- m_device.deallocate(m_allocated_data);
- }
- }
-
- const Dimensions& block_sizes() const { return m_block_sizes; }
- const Dimensions& block_strides() const { return m_block_strides; }
- const Scalar* data() const { return m_data; }
-
- private:
- const Device EIGEN_DEVICE_REF m_device;
- Dimensions m_block_sizes, m_block_strides;
- const Scalar* m_data; // Not owned.
- Scalar* m_allocated_data; // Owned.
-};
-
/**
* \class TensorBlockMapper
* \ingroup CXX11_Tensor_Module
@@ -1108,137 +332,6 @@ class TensorBlockMapper {
StorageIndex m_total_block_count;
};
-/**
- * \class TensorSliceBlockMapper
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor slice block mapper class.
- *
- * This class is responsible for iterating over the blocks of
- * a slice of a tensor. Supports shuffling of the block strides
- * for callers that want to reduce strides for dimensions to be
- * processed together.
- *
- */
-template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
-class TensorSliceBlockMapper {
- public:
- typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
- typedef DSizes<StorageIndex, NumDims> Dimensions;
-
- TensorSliceBlockMapper(const Dimensions& tensor_dims,
- const Dimensions& tensor_slice_offsets,
- const Dimensions& tensor_slice_extents,
- const Dimensions& block_dim_sizes,
- const Dimensions& block_stride_order)
- : m_tensor_dimensions(tensor_dims),
- m_tensor_slice_offsets(tensor_slice_offsets),
- m_tensor_slice_extents(tensor_slice_extents),
- m_block_dim_sizes(block_dim_sizes),
- m_block_stride_order(block_stride_order),
- m_total_block_count(1) {
- // Calculate block counts by dimension and total block count.
- DSizes<StorageIndex, NumDims> block_count;
- for (Index i = 0; i < block_count.rank(); ++i) {
- block_count[i] = divup(m_tensor_slice_extents[i], m_block_dim_sizes[i]);
- }
- m_total_block_count = array_prod(block_count);
-
- // Calculate block strides (used for enumerating blocks).
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_block_strides[0] = 1;
- m_tensor_strides[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- m_block_strides[i] = m_block_strides[i - 1] * block_count[i - 1];
- m_tensor_strides[i] =
- m_tensor_strides[i - 1] * m_tensor_dimensions[i - 1];
- }
- } else {
- m_block_strides[NumDims - 1] = 1;
- m_tensor_strides[NumDims - 1] = 1;
- for (int i = NumDims - 2; i >= 0; --i) {
- m_block_strides[i] = m_block_strides[i + 1] * block_count[i + 1];
- m_tensor_strides[i] =
- m_tensor_strides[i + 1] * m_tensor_dimensions[i + 1];
- }
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Block
- GetBlockForIndex(StorageIndex block_index, Scalar* data) const {
- StorageIndex first_coeff_index = 0;
- DSizes<StorageIndex, NumDims> coords;
- DSizes<StorageIndex, NumDims> sizes;
- DSizes<StorageIndex, NumDims> strides;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = NumDims - 1; i > 0; --i) {
- const Index idx = block_index / m_block_strides[i];
- coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i];
- sizes[i] = numext::mini(
- m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i],
- m_block_dim_sizes[i]);
- block_index -= idx * m_block_strides[i];
- first_coeff_index += coords[i] * m_tensor_strides[i];
- }
- coords[0] =
- m_tensor_slice_offsets[0] + block_index * m_block_dim_sizes[0];
- sizes[0] = numext::mini(
- m_tensor_slice_offsets[0] + m_tensor_slice_extents[0] - coords[0],
- m_block_dim_sizes[0]);
- first_coeff_index += coords[0] * m_tensor_strides[0];
-
- StorageIndex prev_dim = m_block_stride_order[0];
- strides[prev_dim] = 1;
- for (int i = 1; i < NumDims; ++i) {
- const StorageIndex curr_dim = m_block_stride_order[i];
- strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
- prev_dim = curr_dim;
- }
- } else {
- for (int i = 0; i < NumDims - 1; ++i) {
- const StorageIndex idx = block_index / m_block_strides[i];
- coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i];
- sizes[i] = numext::mini(
- m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i],
- m_block_dim_sizes[i]);
- block_index -= idx * m_block_strides[i];
- first_coeff_index += coords[i] * m_tensor_strides[i];
- }
- coords[NumDims - 1] = m_tensor_slice_offsets[NumDims - 1] +
- block_index * m_block_dim_sizes[NumDims - 1];
- sizes[NumDims - 1] = numext::mini(
- m_tensor_slice_offsets[NumDims - 1] +
- m_tensor_slice_extents[NumDims - 1] - coords[NumDims - 1],
- m_block_dim_sizes[NumDims - 1]);
- first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1];
-
- StorageIndex prev_dim = m_block_stride_order[NumDims - 1];
- strides[prev_dim] = 1;
- for (int i = NumDims - 2; i >= 0; --i) {
- const StorageIndex curr_dim = m_block_stride_order[i];
- strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
- prev_dim = curr_dim;
- }
- }
-
- return Block(first_coeff_index, sizes, strides, m_tensor_strides, data);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const {
- return m_total_block_count;
- }
-
- private:
- Dimensions m_tensor_dimensions;
- Dimensions m_tensor_slice_offsets;
- Dimensions m_tensor_slice_extents;
- Dimensions m_tensor_strides;
- Dimensions m_block_dim_sizes;
- Dimensions m_block_stride_order;
- Dimensions m_block_strides;
- StorageIndex m_total_block_count;
-};
-
} // namespace internal
} // namespace Eigen
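The TensorSliceBlockMapper removed above (like the TensorBlockMapper that stays) enumerates blocks by peeling a linear block index apart with per-dimension block strides and clipping blocks at the tensor's edge to the remaining extent. A minimal standalone col-major sketch of that decomposition, with slice offsets omitted and purely illustrative numbers:

#include <cstdio>

int main() {
  const int kDims = 2;
  const long tensor_dims[kDims]     = {10, 7};  // full tensor extents
  const long block_dim_sizes[kDims] = {4, 3};   // requested block extents

  // Block counts per dimension and block strides for enumeration (col-major).
  long block_count[kDims], block_strides[kDims];
  for (int i = 0; i < kDims; ++i)
    block_count[i] = (tensor_dims[i] + block_dim_sizes[i] - 1) / block_dim_sizes[i];
  block_strides[0] = 1;
  for (int i = 1; i < kDims; ++i)
    block_strides[i] = block_strides[i - 1] * block_count[i - 1];

  long block_index = 5;  // some block produced by the enumeration
  long coords[kDims], sizes[kDims];
  for (int i = kDims - 1; i > 0; --i) {
    const long idx = block_index / block_strides[i];
    coords[i] = idx * block_dim_sizes[i];
    sizes[i]  = (tensor_dims[i] - coords[i] < block_dim_sizes[i])
                    ? tensor_dims[i] - coords[i] : block_dim_sizes[i];
    block_index -= idx * block_strides[i];
  }
  coords[0] = block_index * block_dim_sizes[0];
  sizes[0]  = (tensor_dims[0] - coords[0] < block_dim_sizes[0])
                  ? tensor_dims[0] - coords[0] : block_dim_sizes[0];

  // Block 5 of a 10x7 tensor tiled 4x3 starts at (8, 3) and is clipped to 2x3.
  std::printf("block 5 starts at (%ld, %ld) with sizes (%ld, %ld)\n",
              coords[0], coords[1], sizes[0], sizes[1]);
  return 0;
}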
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 58164c13a..80162ad12 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -114,7 +114,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
enum {
IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -123,21 +122,10 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
- // Block based access to the XprType (input) tensor.
- typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
- TensorBlock;
- typedef internal::TensorBlockReader<ScalarNoConst, Index, NumDims, Layout>
- TensorBlockReader;
-
// We do block based broadcasting using a trick with 2x tensor rank and 0
// strides. See block method implementation for details.
typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
- typedef internal::TensorBlock<ScalarNoConst, Index, 2 * NumDims, Layout>
- BroadcastTensorBlock;
- typedef internal::TensorBlockReader<ScalarNoConst, Index, 2 * NumDims, Layout>
- BroadcastTensorBlockReader;
-
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
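The comment kept just above explains that block-based broadcasting uses a view with twice the tensor rank and zero input strides on the broadcast dimensions; the removed block() implementation below built exactly such [d_0, b_0, d_1, b_1, ...] views. A minimal standalone sketch of the zero-stride trick itself, with illustrative values only:

#include <cstdio>

int main() {
  const int d = 3, b = 2;                 // input size d, broadcast factor b
  const float input[d] = {1.f, 2.f, 3.f};

  const long sizes[2]    = {d, b};        // 2-D view over the 1-D input
  const long istrides[2] = {1, 0};        // stride 0 on the broadcast dimension

  // Col-major traversal of the view: dimension 0 varies fastest, and the
  // zero stride makes dimension 1 re-read the same coefficients.
  for (long j = 0; j < sizes[1]; ++j)
    for (long i = 0; i < sizes[0]; ++i)
      std::printf("%g ", input[i * istrides[0] + j * istrides[1]]);
  std::printf("\n");  // prints: 1 2 3 1 2 3
  return 0;
}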
@@ -641,246 +629,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
m_impl.getResourceRequirements(resources);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
- TensorBlock* output_block) const {
- if (NumDims <= 0) {
- output_block->data()[0] = m_impl.coeff(0);
- return;
- }
-
- // Because we only support kSkewedInnerDims blocking, block size should be
- // equal to m_dimensions for inner dims, a smaller than m_dimensions[i] size
- // for the first outer dim, and 1 for other outer dims. This is guaranteed
- // by MergeResourceRequirements() in TensorBlock.h.
- const Dimensions& output_block_sizes = output_block->block_sizes();
- const Dimensions& output_block_strides = output_block->block_strides();
-
- // Find where outer dims start.
- int outer_dim_start = 0;
- Index outer_dim_size = 1, inner_dim_size = 1;
- for (int i = 0; i < NumDims; ++i) {
- const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? i
- : NumDims - i - 1;
- if (i > outer_dim_start) {
- eigen_assert(output_block_sizes[dim] == 1);
- } else if (output_block_sizes[dim] != m_dimensions[dim]) {
- eigen_assert(output_block_sizes[dim] < m_dimensions[dim]);
- outer_dim_size = output_block_sizes[dim];
- } else {
- inner_dim_size *= output_block_sizes[dim];
- ++outer_dim_start;
- }
- }
-
- if (inner_dim_size == 0 || outer_dim_size == 0) {
- return;
- }
-
- const Dimensions& input_dims = Dimensions(m_impl.dimensions());
-
- // Pre-fill input_block_sizes, broadcast_block_sizes,
- // broadcast_block_strides, and broadcast_tensor_strides. Later on we will
- // only modify the outer_dim_start-th dimension on these arrays.
-
- // Calculate the input block size for looking into the input.
- Dimensions input_block_sizes;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = 0; i < outer_dim_start; ++i) {
- input_block_sizes[i] = input_dims[i];
- }
- for (int i = outer_dim_start; i < NumDims; ++i) {
- input_block_sizes[i] = 1;
- }
- } else {
- for (int i = 0; i < outer_dim_start; ++i) {
- input_block_sizes[NumDims - i - 1] = input_dims[NumDims - i - 1];
- }
- for (int i = outer_dim_start; i < NumDims; ++i) {
- input_block_sizes[NumDims - i - 1] = 1;
- }
- }
-
- // Broadcast with the 0-stride trick: Create 1 extra dim for each
- // broadcast, set the input stride to 0.
- //
- // When ColMajor:
- // - broadcast_block_sizes is [d_0, b_0, d_1, b_1, ...].
- //
- // - broadcast_block_strides is [output_block_strides[0],
- // output_block_strides[0] * d_0,
- // output_block_strides[1],
- // output_block_strides[1] * d_1,
- // ...].
- //
- // - broadcast_tensor_strides is [output_block_strides[0],
- // 0,
- // output_block_strides[1],
- // 0,
- // ...].
- BroadcastDimensions broadcast_block_sizes, broadcast_block_strides,
- broadcast_tensor_strides;
-
- for (int i = 0; i < outer_dim_start; ++i) {
- const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? i
- : NumDims - i - 1;
- const int copy_dim =
- static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 2 * i
- : 2 * NumDims - 2 * i - 1;
- const int broadcast_dim =
- static_cast<int>(Layout) == static_cast<int>(ColMajor) ? copy_dim + 1
- : copy_dim - 1;
- broadcast_block_sizes[copy_dim] = input_dims[dim];
- broadcast_block_sizes[broadcast_dim] = m_broadcast[dim];
- broadcast_block_strides[copy_dim] = output_block_strides[dim];
- broadcast_block_strides[broadcast_dim] =
- output_block_strides[dim] * input_dims[dim];
- broadcast_tensor_strides[copy_dim] = m_inputStrides[dim];
- broadcast_tensor_strides[broadcast_dim] = 0;
- }
- for (int i = 2 * outer_dim_start; i < 2 * NumDims; ++i) {
- const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? i
- : 2 * NumDims - i - 1;
- broadcast_block_sizes[dim] = 1;
- broadcast_block_strides[dim] = 0;
- broadcast_tensor_strides[dim] = 0;
- }
-
- const int outer_dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? outer_dim_start
- : NumDims - outer_dim_start - 1;
-
- if (outer_dim_size == 1) {
- // We just need one block read using the ready-set values above.
- BroadcastBlock(input_block_sizes, broadcast_block_sizes,
- broadcast_block_strides, broadcast_tensor_strides, 0,
- output_block);
- } else if (input_dims[outer_dim] == 1) {
- // Broadcast outer_dim_start-th dimension (< NumDims) by outer_dim_size.
- const int broadcast_outer_dim =
- static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 2 * outer_dim_start + 1
- : 2 * NumDims - 2 * outer_dim_start - 2;
- broadcast_block_sizes[broadcast_outer_dim] = outer_dim_size;
- broadcast_tensor_strides[broadcast_outer_dim] = 0;
- broadcast_block_strides[broadcast_outer_dim] =
- output_block_strides[outer_dim];
- BroadcastBlock(input_block_sizes, broadcast_block_sizes,
- broadcast_block_strides, broadcast_tensor_strides, 0,
- output_block);
- } else {
- // The general case. Let's denote the output block as x[...,
- // a:a+outer_dim_size, :, ..., :], where a:a+outer_dim_size is a slice on
- // the outer_dim_start-th dimension (< NumDims). We need to split the
- // a:a+outer_dim_size into possibly 3 sub-blocks:
- //
- // (1) a:b, where b is the smallest multiple of
- // input_dims[outer_dim_start] in [a, a+outer_dim_size].
- //
- // (2) b:c, where c is the largest multiple of input_dims[outer_dim_start]
- // in [a, a+outer_dim_size].
- //
- // (3) c:a+outer_dim_size .
- //
- // Or, when b and c do not exist, we just need to process the whole block
- // together.
-
- // Find a.
- const Index outer_dim_left_index =
- output_block->first_coeff_index() / m_outputStrides[outer_dim];
-
- // Find b and c.
- const Index input_outer_dim_size = input_dims[outer_dim];
-
- // First multiple after a. This is b when <= outer_dim_left_index +
- // outer_dim_size.
- const Index first_multiple =
- divup<Index>(outer_dim_left_index, input_outer_dim_size) *
- input_outer_dim_size;
-
- if (first_multiple <= outer_dim_left_index + outer_dim_size) {
- // b exists, so does c. Find it.
- const Index last_multiple = (outer_dim_left_index + outer_dim_size) /
- input_outer_dim_size * input_outer_dim_size;
- const int copy_outer_dim =
- static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 2 * outer_dim_start
- : 2 * NumDims - 2 * outer_dim_start - 1;
- const int broadcast_outer_dim =
- static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 2 * outer_dim_start + 1
- : 2 * NumDims - 2 * outer_dim_start - 2;
- if (first_multiple > outer_dim_left_index) {
- const Index head_size = first_multiple - outer_dim_left_index;
- input_block_sizes[outer_dim] = head_size;
- broadcast_block_sizes[copy_outer_dim] = head_size;
- broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
- broadcast_block_strides[copy_outer_dim] =
- output_block_strides[outer_dim];
- broadcast_block_sizes[broadcast_outer_dim] = 1;
- broadcast_tensor_strides[broadcast_outer_dim] = 0;
- broadcast_block_strides[broadcast_outer_dim] =
- output_block_strides[outer_dim] * input_dims[outer_dim];
- BroadcastBlock(input_block_sizes, broadcast_block_sizes,
- broadcast_block_strides, broadcast_tensor_strides, 0,
- output_block);
- }
- if (first_multiple < last_multiple) {
- input_block_sizes[outer_dim] = input_outer_dim_size;
- broadcast_block_sizes[copy_outer_dim] = input_outer_dim_size;
- broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
- broadcast_block_strides[copy_outer_dim] =
- output_block_strides[outer_dim];
- broadcast_block_sizes[broadcast_outer_dim] =
- (last_multiple - first_multiple) / input_outer_dim_size;
- broadcast_tensor_strides[broadcast_outer_dim] = 0;
- broadcast_block_strides[broadcast_outer_dim] =
- output_block_strides[outer_dim] * input_dims[outer_dim];
- const Index offset = (first_multiple - outer_dim_left_index) *
- m_outputStrides[outer_dim];
- BroadcastBlock(input_block_sizes, broadcast_block_sizes,
- broadcast_block_strides, broadcast_tensor_strides,
- offset, output_block);
- }
- if (last_multiple < outer_dim_left_index + outer_dim_size) {
- const Index tail_size =
- outer_dim_left_index + outer_dim_size - last_multiple;
- input_block_sizes[outer_dim] = tail_size;
- broadcast_block_sizes[copy_outer_dim] = tail_size;
- broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
- broadcast_block_strides[copy_outer_dim] =
- output_block_strides[outer_dim];
- broadcast_block_sizes[broadcast_outer_dim] = 1;
- broadcast_tensor_strides[broadcast_outer_dim] = 0;
- broadcast_block_strides[broadcast_outer_dim] =
- output_block_strides[outer_dim] * input_dims[outer_dim];
- const Index offset = (last_multiple - outer_dim_left_index) *
- m_outputStrides[outer_dim];
- BroadcastBlock(input_block_sizes, broadcast_block_sizes,
- broadcast_block_strides, broadcast_tensor_strides,
- offset, output_block);
- }
- } else {
- // b and c do not exist.
- const int copy_outer_dim =
- static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 2 * outer_dim_start
- : 2 * NumDims - 2 * outer_dim_start - 1;
- input_block_sizes[outer_dim] = outer_dim_size;
- broadcast_block_sizes[copy_outer_dim] = outer_dim_size;
- broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
- broadcast_block_strides[copy_outer_dim] =
- output_block_strides[outer_dim];
- BroadcastBlock(input_block_sizes, broadcast_block_sizes,
- broadcast_block_strides, broadcast_tensor_strides, 0,
- output_block);
- }
- }
- }
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
@@ -1096,28 +844,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
return params;
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void BroadcastBlock(
- const Dimensions& input_block_sizes,
- const BroadcastDimensions& broadcast_block_sizes,
- const BroadcastDimensions& broadcast_block_strides,
- const BroadcastDimensions& broadcast_tensor_strides, Index offset,
- TensorBlock* output_block) const {
- TensorBlock input_view_block(
- static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? indexColMajor(output_block->first_coeff_index() + offset)
- : indexRowMajor(output_block->first_coeff_index() + offset),
- input_block_sizes, Dimensions(m_inputStrides),
- Dimensions(m_inputStrides), NULL);
-
- internal::TensorBlockView<ArgType, Device> input_block(m_device, m_impl,
- input_view_block);
- BroadcastTensorBlock broadcast_block(
- 0, broadcast_block_sizes, broadcast_block_strides,
- broadcast_tensor_strides, output_block->data() + offset);
-
- BroadcastTensorBlockReader::Run(&broadcast_block, input_block.data());
- }
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 emptyBlock() const {
DSizes<Index, NumDims> dimensions;
for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 32d6960bf..098110217 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -148,7 +148,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
IsAligned = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
// Chipping of outer-most dimension is a trivial operation, because we can
// read and write directly from the underlying tensor using single offset.
@@ -167,11 +166,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
- typedef internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>
- InputTensorBlock;
- typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
- OutputTensorBlock;
-
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
@@ -218,20 +212,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
}
m_inputStride *= input_dims[m_dim.actualDim()];
m_inputOffset = m_stride * op.offset();
-
- if (BlockAccess) {
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_inputStrides[0] = 1;
- for (int i = 1; i < NumInputDims; ++i) {
- m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
- }
- } else {
- m_inputStrides[NumInputDims - 1] = 1;
- for (int i = NumInputDims - 2; i >= 0; --i) {
- m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
- }
- }
- }
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -323,52 +303,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
m_impl.getResourceRequirements(resources);
}
- // TODO(andydavis) Reduce the overhead of this function (experiment with
- // using a fixed block size).
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
- OutputTensorBlock* output_block) const {
- // Calculate input block sizes.
- const DSizes<Index, NumDims>& output_block_sizes =
- output_block->block_sizes();
- const DSizes<Index, NumDims>& output_block_strides =
- output_block->block_strides();
- const Index chip_dim = m_dim.actualDim();
- DSizes<Index, NumInputDims> input_block_sizes;
- DSizes<Index, NumInputDims> input_block_strides;
- for (Index i = 0; i < NumInputDims; ++i) {
- if (i < chip_dim) {
- input_block_sizes[i] = output_block_sizes[i];
- input_block_strides[i] = output_block_strides[i];
- } else if (i > chip_dim) {
- input_block_sizes[i] = output_block_sizes[i - 1];
- input_block_strides[i] = output_block_strides[i - 1];
- } else {
- input_block_sizes[i] = 1;
- }
- }
- // Fix up input_block_stride for chip dimension.
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- if (chip_dim == 0) {
- input_block_strides[chip_dim] = 1;
- } else {
- input_block_strides[chip_dim] =
- input_block_strides[chip_dim - 1] * input_block_sizes[chip_dim - 1];
- }
- } else {
- if (chip_dim == NumInputDims - 1) {
- input_block_strides[chip_dim] = 1;
- } else {
- input_block_strides[chip_dim] =
- input_block_strides[chip_dim + 1] * input_block_sizes[chip_dim + 1];
- }
- }
- // Instantiate and read input block from input tensor.
- InputTensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
- input_block_sizes, input_block_strides,
- m_inputStrides, output_block->data());
- m_impl.block(&input_block);
- }
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool root_of_expr_ast = false) const {
@@ -482,7 +416,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
Index m_stride;
Index m_inputOffset;
Index m_inputStride;
- DSizes<Index, NumInputDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
const internal::DimensionId<DimId> m_dim;
const Device EIGEN_DEVICE_REF m_device;
@@ -508,7 +441,6 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 26276abaf..aad9d86be 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -125,7 +125,6 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &&
TensorEvaluator<RightArgType, Device>::PacketAccess,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
@@ -325,7 +324,6 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &&
TensorEvaluator<RightArgType, Device>::PacketAccess,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 87e8db3fd..4bc7b3942 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -381,7 +381,6 @@ struct TensorContractionEvaluatorBase
enum {
IsAligned = true,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index 2a6d67ad5..027322582 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -302,7 +302,6 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
TensorEvaluator<ArgType, Device>::PacketAccess &
internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
#endif
- BlockAccess = false,
BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 8220038c1..44068fedc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -309,7 +309,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
enum {
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<InputArgType, Device>::Layout,
@@ -787,7 +786,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
enum {
IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
PacketAccess = false,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
index b660242f4..5c94165d1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -242,7 +242,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
enum {
IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
PacketAccess = false,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
index f1f46161e..242533f72 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
@@ -95,7 +95,6 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
enum {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<XprType, Device>::Layout,
@@ -269,7 +268,6 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
enum {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<LhsXprType, Device>::Layout,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index cd1338c66..722032a3a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -110,7 +110,6 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = true,
BlockAccessV2 = true,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -120,9 +119,6 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
static const int NumDims = internal::traits<ArgType>::NumDimensions;
- typedef typename internal::TensorBlock<CoeffReturnType, Index, NumDims, Layout> TensorBlock;
- typedef typename internal::TensorBlockReader<CoeffReturnType, Index, NumDims, Layout> TensorBlockReader;
-
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
@@ -173,13 +169,6 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
m_impl.getResourceRequirements(resources);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) {
- TensorBlock eval_to_block(block->first_coeff_index(), block->block_sizes(),
- block->tensor_strides(), block->tensor_strides(),
- m_buffer + block->first_coeff_index());
- m_impl.block(&eval_to_block);
- }
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2(
TensorBlockDesc& desc, TensorBlockScratch& scratch) {
// Add `m_buffer` as destination buffer to the block descriptor.
@@ -216,11 +205,6 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
- assert(m_buffer != NULL);
- TensorBlockReader::Run(block, m_buffer);
- }
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
// We assume that evalPacket or evalScalar is called to perform the
// assignment and account for the cost of the write here.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index ce2305b56..d6a3e6abe 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -45,7 +45,6 @@ struct TensorEvaluator
enum {
IsAligned = Derived::IsAligned,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
BlockAccessV2 = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
PreferBlockAccess = false,
Layout = Derived::Layout,
@@ -55,13 +54,6 @@ struct TensorEvaluator
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
- typedef typename internal::TensorBlock<ScalarNoConst, Index, NumCoords, Layout>
- TensorBlock;
- typedef typename internal::TensorBlockReader<ScalarNoConst, Index, NumCoords, Layout>
- TensorBlockReader;
- typedef typename internal::TensorBlockWriter<ScalarNoConst, Index, NumCoords, Layout>
- TensorBlockWriter;
-
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
@@ -160,11 +152,6 @@ struct TensorEvaluator
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
std::vector<internal::TensorOpResourceRequirements>*) const {}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
- assert(m_data != NULL);
- TensorBlockReader::Run(block, m_data);
- }
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
@@ -172,12 +159,6 @@ struct TensorEvaluator
return TensorBlockV2::materialize(m_data, m_dims, desc, scratch);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
- const TensorBlock& block) {
- assert(m_data != NULL);
- TensorBlockWriter::Run(block, m_data);
- }
-
template<typename TensorBlockV2>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2(
const TensorBlockDesc& desc, const TensorBlockV2& block) {
@@ -263,7 +244,6 @@ struct TensorEvaluator<const Derived, Device>
enum {
IsAligned = Derived::IsAligned,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = internal::is_arithmetic<ScalarNoConst>::value,
BlockAccessV2 = internal::is_arithmetic<ScalarNoConst>::value,
PreferBlockAccess = false,
Layout = Derived::Layout,
@@ -271,11 +251,6 @@ struct TensorEvaluator<const Derived, Device>
RawAccess = true
};
- typedef typename internal::TensorBlock<ScalarNoConst, Index, NumCoords, Layout>
- TensorBlock;
- typedef typename internal::TensorBlockReader<ScalarNoConst, Index, NumCoords, Layout>
- TensorBlockReader;
-
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
@@ -348,11 +323,6 @@ struct TensorEvaluator<const Derived, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
std::vector<internal::TensorOpResourceRequirements>*) const {}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
- assert(m_data != NULL);
- TensorBlockReader::Run(block, m_data);
- }
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
@@ -404,7 +374,6 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
&& (PacketType<CoeffReturnType, Device>::size >1)
#endif
,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -475,7 +444,6 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess &
internal::functor_traits<UnaryOp>::PacketAccess,
- BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -554,24 +522,6 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
m_argImpl.getResourceRequirements(resources);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
- TensorBlock* output_block) const {
- if (NumDims <= 0) {
- output_block->data()[0] = coeff(0);
- return;
- }
- internal::TensorBlockView<ArgType, Device> arg_block(m_device, m_argImpl,
- *output_block);
- internal::TensorBlockCwiseUnaryIO<UnaryOp, Index, ScalarNoConst, NumDims,
- Layout>::Run(m_functor,
- output_block->block_sizes(),
- output_block
- ->block_strides(),
- output_block->data(),
- arg_block.block_strides(),
- arg_block.data());
- }
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
@@ -608,8 +558,6 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &
TensorEvaluator<RightArgType, Device>::PacketAccess &
internal::functor_traits<BinaryOp>::PacketAccess,
- BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
- TensorEvaluator<RightArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<LeftArgType, Device>::BlockAccessV2 &
TensorEvaluator<RightArgType, Device>::BlockAccessV2,
PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess |
@@ -713,24 +661,6 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
m_rightImpl.getResourceRequirements(resources);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
- TensorBlock* output_block) const {
- if (NumDims <= 0) {
- output_block->data()[0] = coeff(Index(0));
- return;
- }
- internal::TensorBlockView<LeftArgType, Device> left_block(
- m_device, m_leftImpl, *output_block);
- internal::TensorBlockView<RightArgType, Device> right_block(
- m_device, m_rightImpl, *output_block);
- internal::TensorBlockCwiseBinaryIO<
- BinaryOp, Index, typename internal::remove_const<Scalar>::type, NumDims,
- Layout>::Run(m_functor, output_block->block_sizes(),
- output_block->block_strides(), output_block->data(),
- left_block.block_strides(), left_block.data(),
- right_block.block_strides(), right_block.data());
- }
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
@@ -768,7 +698,6 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
TensorEvaluator<Arg2Type, Device>::PacketAccess &&
TensorEvaluator<Arg3Type, Device>::PacketAccess &&
internal::functor_traits<TernaryOp>::PacketAccess,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = TensorEvaluator<Arg1Type, Device>::PreferBlockAccess ||
TensorEvaluator<Arg2Type, Device>::PreferBlockAccess ||
@@ -887,7 +816,6 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess &
TensorEvaluator<ElseArgType, Device>::PacketAccess &
PacketType<Scalar, Device>::HasBlend,
- BlockAccess = false,
BlockAccessV2 = TensorEvaluator<IfArgType, Device>::BlockAccessV2 &&
TensorEvaluator<ThenArgType, Device>::BlockAccessV2 &&
TensorEvaluator<ElseArgType, Device>::BlockAccessV2,
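
In TensorEvaluator.h the legacy TensorBlockReader/TensorBlockWriter typedefs and the block()/writeBlock() methods disappear from the base evaluators, and the cwise unary/binary evaluators lose their TensorBlockCwise*IO implementations; only the V2 descriptor-based paths remain. Elementwise expressions are unaffected at the API level. An illustrative expression whose nodes map onto these evaluators (sizes arbitrary, not part of the patch):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(128, 128), b(128, 128);
  a.setRandom();
  b.setRandom();

  // abs(), sqrt() and the scalar multiple are cwise unary nodes, '+' is a
  // cwise binary node; whether they are evaluated block-by-block is decided
  // by the executor from the evaluator traits shown in the hunks above.
  Eigen::Tensor<float, 2> c = a.abs().sqrt() + b * 0.5f;
  return 0;
}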
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 0fb0a9227..9926046b9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -159,70 +159,6 @@ class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
*/
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
- /*Tiling=*/TiledEvaluation::Legacy> {
- public:
- typedef typename traits<Expression>::Scalar Scalar;
- typedef typename remove_const<Scalar>::type ScalarNoConst;
-
- typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
- typedef typename traits<Expression>::Index StorageIndex;
-
- static const int NumDims = traits<Expression>::NumDimensions;
-
- EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE void run(const Expression& expr,
- const DefaultDevice& device = DefaultDevice()) {
- typedef TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlock;
- typedef TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlockMapper;
- typedef typename TensorBlock::Dimensions TensorBlockDimensions;
-
- Evaluator evaluator(expr, device);
- Index total_size = array_prod(evaluator.dimensions());
- Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
-
- if (total_size < cache_size
- && !ExpressionHasTensorBroadcastingOp<Expression>::value) {
- // TODO(andydavis) Reduce block management overhead for small tensors.
- internal::TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tiling=*/TiledEvaluation::Off>::run(expr,device);
- evaluator.cleanup();
- return;
- }
-
- const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
- if (needs_assign) {
- // Size tensor blocks to fit in cache (or requested target block size).
- Index block_total_size = numext::mini(cache_size, total_size);
- TensorBlockShapeType block_shape = kSkewedInnerDims;
- // Query expression tree for desired block size/shape.
- std::vector<TensorOpResourceRequirements> resources;
- evaluator.getResourceRequirements(&resources);
- MergeResourceRequirements(resources, &block_shape, &block_total_size);
-
- TensorBlockMapper block_mapper(
- TensorBlockDimensions(evaluator.dimensions()), block_shape,
- block_total_size);
- block_total_size = block_mapper.block_dims_total_size();
-
- ScalarNoConst* data = static_cast<ScalarNoConst*>(
- device.allocate(block_total_size * sizeof(Scalar)));
-
- const StorageIndex total_block_count = block_mapper.total_block_count();
- for (StorageIndex i = 0; i < total_block_count; ++i) {
- TensorBlock block = block_mapper.GetBlockForIndex(i, data);
- evaluator.evalBlock(&block);
- }
- device.deallocate(data);
- }
- evaluator.cleanup();
- }
-};
-
-/**
- * Process all the data with a single cpu thread, using blocks of data. By
- * sizing a block to fit L1 cache we get better cache performance.
- */
-template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, DefaultDevice, Vectorizable,
/*Tiling=*/TiledEvaluation::On> {
public:
typedef typename traits<Expression>::Scalar Scalar;
@@ -448,59 +384,6 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
- /*Tiling=*/TiledEvaluation::Legacy> {
- public:
- typedef typename traits<Expression>::Index StorageIndex;
- typedef typename traits<Expression>::Scalar Scalar;
- typedef typename remove_const<Scalar>::type ScalarNoConst;
-
- static const int NumDims = traits<Expression>::NumDimensions;
-
- typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
- typedef TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> BlockMapper;
- typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
-
- static EIGEN_STRONG_INLINE void run(const Expression& expr,
- const ThreadPoolDevice& device) {
- Evaluator evaluator(expr, device);
- Index total_size = array_prod(evaluator.dimensions());
- Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
-
- if (total_size < cache_size &&
- !ExpressionHasTensorBroadcastingOp<Expression>::value) {
- // TODO(andydavis) Reduce block management overhead for small tensors.
- internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
- /*Tiling=*/TiledEvaluation::Off>::run(expr,
- device);
- evaluator.cleanup();
- return;
- }
-
- const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
- if (needs_assign) {
- const TilingContext tiling =
- internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
- Vectorizable>(device, evaluator);
-
- device.parallelFor(
- tiling.block_mapper.total_block_count(), tiling.cost,
- [=, &device, &evaluator, &tiling](StorageIndex firstIdx,
- StorageIndex lastIdx) {
- ScalarNoConst* thread_buf =
- tiling.template GetCurrentThreadBuffer<ScalarNoConst>(device);
- for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
- auto block = tiling.block_mapper.GetBlockForIndex(i, thread_buf);
- evaluator.evalBlock(&block);
- }
- });
- device.deallocate(tiling.buffer);
- }
- evaluator.cleanup();
- }
-};
-
-template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
/*Tiling=*/TiledEvaluation::On> {
public:
typedef typename traits<Expression>::Index IndexType;
@@ -605,91 +488,6 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
template <typename Expression, typename DoneCallback, bool Vectorizable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
- Vectorizable, /*Tileable*/ TiledEvaluation::Legacy> {
- public:
- typedef typename traits<Expression>::Index StorageIndex;
- typedef typename traits<Expression>::Scalar Scalar;
- typedef typename remove_const<Scalar>::type ScalarNoConst;
-
- static const int NumDims = traits<Expression>::NumDimensions;
-
- typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
- typedef TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims,
- Evaluator::Layout>
- BlockMapper;
- typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
-
- static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
- const ThreadPoolDevice& device,
- DoneCallback done) {
- TensorAsyncExecutorContext* const ctx =
- new TensorAsyncExecutorContext(expr, device, std::move(done));
-
- Index total_size = array_prod(ctx->evaluator.dimensions());
- Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
-
- if (total_size < cache_size &&
- !ExpressionHasTensorBroadcastingOp<Expression>::value) {
- auto delete_ctx = [ctx]() { delete ctx; };
- internal::TensorAsyncExecutor<
- Expression, ThreadPoolDevice, decltype(delete_ctx), Vectorizable,
- /*Tileable*/ TiledEvaluation::Off>::runAsync(expr, device, std::move(delete_ctx));
- return;
- }
-
- const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
- if (!need_assign) {
- delete ctx;
- return;
- }
-
- ctx->tiling =
- GetTensorExecutorTilingContext<Evaluator, BlockMapper, Vectorizable>(
- device, ctx->evaluator);
-
- auto eval_block = [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
- ScalarNoConst* thread_buf =
- ctx->tiling.template GetCurrentThreadBuffer<ScalarNoConst>(
- ctx->device);
- for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
- auto block = ctx->tiling.block_mapper.GetBlockForIndex(i, thread_buf);
- ctx->evaluator.evalBlock(&block);
- }
- };
- device.parallelForAsync(ctx->tiling.block_mapper.total_block_count(),
- ctx->tiling.cost, eval_block,
- [ctx]() { delete ctx; });
- };
-
- ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
- }
-
- private:
- struct TensorAsyncExecutorContext {
- TensorAsyncExecutorContext(const Expression& expr,
- const ThreadPoolDevice& thread_pool,
- DoneCallback done)
- : device(thread_pool),
- evaluator(expr, thread_pool),
- on_done(std::move(done)) {}
-
- ~TensorAsyncExecutorContext() {
- device.deallocate(tiling.buffer);
- evaluator.cleanup();
- on_done();
- }
-
- const ThreadPoolDevice& device;
- Evaluator evaluator;
- TilingContext tiling;
-
- private:
- DoneCallback on_done;
- };
-};
-
-template <typename Expression, typename DoneCallback, bool Vectorizable>
-class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
Vectorizable, /*Tileable*/ TiledEvaluation::On> {
public:
typedef typename traits<Expression>::Index IndexType;
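
With the Legacy specializations deleted, TensorExecutor and TensorAsyncExecutor choose between exactly two modes: TiledEvaluation::On (the V2 block path) and TiledEvaluation::Off. Code that assigns through a device does not change. A minimal sketch of the documented ThreadPoolDevice usage; the thread and core counts here are arbitrary:

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::ThreadPool pool(/*num_threads=*/4);
  Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

  Eigen::Tensor<float, 2> a(512, 512), b(512, 512), c(512, 512);
  a.setRandom();
  b.setRandom();

  // The executor specialization picked here depends on IsTileable<...>, which
  // after this patch is a two-way On/Off choice.
  c.device(device) = a + b;
  return 0;
}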
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index 55c7d6831..a8841bc38 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -133,7 +133,6 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
enum {
IsAligned = false,
PacketAccess = true,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index 5f06c97ab..ea3ea2c91 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -41,7 +41,6 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
enum {
IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = Options_ & RowMajor ? RowMajor : ColMajor,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index e5b67a18c..8d17d4b76 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -96,7 +96,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
enum {
IsAligned = true,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = internal::is_arithmetic<CoeffReturnType>::value,
BlockAccessV2 = internal::is_arithmetic<CoeffReturnType>::value,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -105,11 +104,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
static const int NumDims = internal::traits<ArgType>::NumDimensions;
- typedef typename internal::TensorBlock<CoeffReturnType, Index, NumDims, Layout>
- TensorBlock;
- typedef typename internal::TensorBlockReader<CoeffReturnType, Index, NumDims, Layout>
- TensorBlockReader;
-
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
@@ -185,11 +179,6 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
std::vector<internal::TensorOpResourceRequirements>*) const {}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
- assert(m_buffer != NULL);
- TensorBlockReader::Run(block, m_buffer);
- }
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
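
The forced-eval evaluator likewise drops its legacy TensorBlockReader typedefs and block() method and keeps only the V2 materialization path. At the API level this is the evaluator behind .eval() on a tensor expression; a small illustrative use (sizes arbitrary):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(64, 64), b(64, 64);
  a.setRandom();
  b.setRandom();

  // .eval() forces the sub-expression into a temporary buffer
  // (TensorForcedEvalOp); the outer expression then reads from that buffer.
  Eigen::Tensor<float, 2> c = (a + b).eval() * 2.0f;
  return 0;
}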
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index 0da2d9e0d..389d5d906 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -158,7 +158,6 @@ struct IsVectorizable<GpuDevice, Expression> {
enum TiledEvaluation {
Off = 0, // tiled evaluation is not supported
On = 1, // still work in progress (see TensorBlockV2.h)
-  Legacy = 2 // soon to be deprecated (see TensorBlock.h)
};
template <typename Device, typename Expression>
@@ -166,18 +165,12 @@ struct IsTileable {
// Check that block evaluation is supported and it's a preferred option (at
// least one sub-expression has much faster block evaluation, e.g.
// broadcasting).
- static const bool BlockAccess =
- TensorEvaluator<Expression, Device>::BlockAccess &&
- TensorEvaluator<Expression, Device>::PreferBlockAccess;
-
static const bool BlockAccessV2 =
TensorEvaluator<Expression, Device>::BlockAccessV2 &&
TensorEvaluator<Expression, Device>::PreferBlockAccess;
static const TiledEvaluation value =
- BlockAccessV2
- ? TiledEvaluation::On
- : (BlockAccess ? TiledEvaluation::Legacy : TiledEvaluation::Off);
+ BlockAccessV2 ? TiledEvaluation::On : TiledEvaluation::Off;
};
template <typename Expression, typename Device,
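
After this hunk IsTileable is no longer a three-way decision: evaluation is tiled exactly when the root evaluator both supports and prefers V2 block access, otherwise it is not tiled at all. A standalone sketch of that selection logic; the stub evaluator and its trait values are hypothetical, only the shape of the computation mirrors the hunk:

#include <iostream>

// Hypothetical stand-in for TensorEvaluator<Expression, Device> traits.
struct FakeEvaluator {
  static const bool BlockAccessV2 = true;
  static const bool PreferBlockAccess = true;
};

enum TiledEvaluation { Off = 0, On = 1 };

template <typename Evaluator>
struct IsTileableSketch {
  // Tile only when block evaluation is both supported and preferred.
  static const bool BlockAccessV2 =
      Evaluator::BlockAccessV2 && Evaluator::PreferBlockAccess;
  static const TiledEvaluation value =
      BlockAccessV2 ? TiledEvaluation::On : TiledEvaluation::Off;
};

int main() {
  std::cout << (IsTileableSketch<FakeEvaluator>::value == TiledEvaluation::On)
            << std::endl;  // prints 1
  return 0;
}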
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index 7f57281a0..77fa32dc7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -93,7 +93,6 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = true,
BlockAccessV2 = true,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -183,60 +182,6 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
Index count;
};
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
- TensorBlock* output_block) const {
- if (NumDims <= 0) return;
-
- static const bool is_col_major =
- static_cast<int>(Layout) == static_cast<int>(ColMajor);
-
- // Compute spatial coordinates for the first block element.
- array<Index, NumDims> coords;
- extract_coordinates(output_block->first_coeff_index(), coords);
- array<Index, NumDims> initial_coords = coords;
-
- CoeffReturnType* data = output_block->data();
- Index offset = 0;
-
- // Initialize output block iterator state. Dimension in this array are
- // always in inner_most -> outer_most order (col major layout).
- array<BlockIteratorState, NumDims> it;
- for (Index i = 0; i < NumDims; ++i) {
- const Index dim = is_col_major ? i : NumDims - 1 - i;
- it[i].size = output_block->block_sizes()[dim];
- it[i].stride = output_block->block_strides()[dim];
- it[i].span = it[i].stride * (it[i].size - 1);
- it[i].count = 0;
- }
- eigen_assert(it[0].stride == 1);
-
- while (it[NumDims - 1].count < it[NumDims - 1].size) {
- // Generate data for the inner-most dimension.
- for (Index i = 0; i < it[0].size; ++i) {
- *(data + offset + i) = m_generator(coords);
- coords[is_col_major ? 0 : NumDims - 1]++;
- }
- coords[is_col_major ? 0 : NumDims - 1] =
- initial_coords[is_col_major ? 0 : NumDims - 1];
-
- // For the 1d tensor we need to generate only one inner-most dimension.
- if (NumDims == 1) break;
-
- // Update offset.
- for (Index i = 1; i < NumDims; ++i) {
- if (++it[i].count < it[i].size) {
- offset += it[i].stride;
- coords[is_col_major ? i : NumDims - 1 - i]++;
- break;
- }
- if (i != NumDims - 1) it[i].count = 0;
- coords[is_col_major ? i : NumDims - 1 - i] =
- initial_coords[is_col_major ? i : NumDims - 1 - i];
- offset -= it[i].span;
- }
- }
- }
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
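
The generator evaluator's coordinate-walking block() is removed and only blockV2 remains. The public generate() API, where the functor receives each element's coordinates, is unchanged; a minimal sketch in which the formula inside the functor is an arbitrary illustration:

#include <unsupported/Eigen/CXX11/Tensor>

// Generator functor: value = 100 * row + column.
struct CoordGen {
  float operator()(const Eigen::array<Eigen::Index, 2>& coords) const {
    return static_cast<float>(100 * coords[0] + coords[1]);
  }
};

int main() {
  // Only the dimensions of 't' matter; its values are never read.
  Eigen::Tensor<float, 2> t(8, 8);
  Eigen::Tensor<float, 2> g = t.generate(CoordGen());
  return 0;
}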
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index 38bf80c5d..49bc60f0a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -231,7 +231,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = true,
BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -541,139 +540,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
internal::kSkewedInnerDims, block_total_size_max));
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
- OutputTensorBlock* output_block) const {
- typedef internal::ImagePatchCopyOp<Self, PacketAccess> ImagePatchCopyOp;
- typedef internal::ImagePatchPaddingOp<Self> ImagePatchPaddingOp;
-
- // Calculate loop limits and various input/output dim sizes.
- const DSizes<Index, NumDims>& block_sizes = output_block->block_sizes();
- const bool col_major =
- static_cast<int>(Layout) == static_cast<int>(ColMajor);
- const Index depth_dim_size = block_sizes[col_major ? 0 : NumDims - 1];
- const Index output_depth_dim_size =
- m_dimensions[col_major ? 0 : NumDims - 1];
- const Index row_dim_size = block_sizes[col_major ? 1 : NumDims - 2];
- const Index output_row_dim_size = m_dimensions[col_major ? 1 : NumDims - 2];
- const Index col_dim_size = block_sizes[col_major ? 2 : NumDims - 3];
- const Index block_col_stride = row_dim_size * depth_dim_size;
- const Index patch_index_dim_size = block_sizes[col_major ? 3 : NumDims - 4];
- const Index outer_dim_size =
- block_sizes.TotalSize() /
- (depth_dim_size * row_dim_size * col_dim_size * patch_index_dim_size);
-
- const Index patch_size = row_dim_size * col_dim_size * depth_dim_size;
- const Index batch_size = patch_size * patch_index_dim_size;
-
- Index output_index = output_block->first_coeff_index();
-
- // Loop through outer dimensions.
- for (Index outer_dim_index = 0; outer_dim_index < outer_dim_size;
- ++outer_dim_index) {
- const Index outer_output_base_index = outer_dim_index * batch_size;
- // Find the offset of the element wrt the location of the first element.
- const Index patchIndexStart = output_index / m_fastPatchStride;
- const Index patchOffset =
- (output_index - patchIndexStart * m_patchStride) / m_fastOutputDepth;
- const Index colOffsetStart = patchOffset / m_fastColStride;
- // Other ways to index this element.
- const Index otherIndex =
- (NumDims == 4) ? 0 : output_index / m_fastOtherStride;
- const Index patch2DIndexStart =
- (NumDims == 4)
- ? 0
- : (output_index - otherIndex * m_otherStride) / m_fastPatchStride;
- // Calculate starting depth index.
- const Index depth = output_index - (output_index / m_fastOutputDepth) *
- output_depth_dim_size;
- const Index patch_input_base_index =
- depth + otherIndex * m_patchInputStride;
-
- // Loop through patches.
- for (Index patch_index_dim_index = 0;
- patch_index_dim_index < patch_index_dim_size;
- ++patch_index_dim_index) {
- const Index patch_output_base_index =
- outer_output_base_index + patch_index_dim_index * patch_size;
- // Patch index corresponding to the passed in index.
- const Index patchIndex = patchIndexStart + patch_index_dim_index;
- const Index patch2DIndex =
- (NumDims == 4) ? patchIndex
- : patch2DIndexStart + patch_index_dim_index;
- const Index colIndex = patch2DIndex / m_fastOutputRows;
- const Index input_col_base = colIndex * m_col_strides;
- const Index row_offset_base =
- (patch2DIndex - colIndex * m_outputRows) * m_row_strides -
- m_rowPaddingTop;
-
- // Loop through columns.
- for (Index col_dim_index = 0; col_dim_index < col_dim_size;
- ++col_dim_index) {
- const Index col_output_base_index =
- patch_output_base_index + col_dim_index * block_col_stride;
-
- // Calculate col index in the input original tensor.
- Index colOffset = colOffsetStart + col_dim_index;
- Index inputCol =
- input_col_base + colOffset * m_in_col_strides - m_colPaddingLeft;
- Index origInputCol =
- (m_col_inflate_strides == 1)
- ? inputCol
- : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0);
-
- bool pad_column = false;
- if (inputCol < 0 || inputCol >= m_input_cols_eff ||
- ((m_col_inflate_strides != 1) &&
- (inputCol != origInputCol * m_col_inflate_strides))) {
- pad_column = true;
- }
-
- const Index col_input_base_index =
- patch_input_base_index + origInputCol * m_colInputStride;
- const Index input_row_base =
- row_offset_base +
- ((patchOffset + col_dim_index * output_row_dim_size) -
- colOffset * m_colStride) *
- m_in_row_strides;
- // Loop through rows.
- for (Index row_dim_index = 0; row_dim_index < row_dim_size;
- ++row_dim_index) {
- const Index output_base_index =
- col_output_base_index + row_dim_index * depth_dim_size;
- bool pad_row = false;
- Index inputIndex;
- if (!pad_column) {
- Index inputRow =
- input_row_base + row_dim_index * m_in_row_strides;
- Index origInputRow =
- (m_row_inflate_strides == 1)
- ? inputRow
- : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride)
- : 0);
- if (inputRow < 0 || inputRow >= m_input_rows_eff ||
- ((m_row_inflate_strides != 1) &&
- (inputRow != origInputRow * m_row_inflate_strides))) {
- pad_row = true;
- } else {
- inputIndex =
- col_input_base_index + origInputRow * m_rowInputStride;
- }
- }
- // Copy (or pad) along depth dimension.
- if (pad_column || pad_row) {
- ImagePatchPaddingOp::Run(depth_dim_size, Scalar(m_paddingValue),
- output_base_index, output_block->data());
- } else {
- ImagePatchCopyOp::Run(*this, depth_dim_size, output_base_index,
- output_block->data(), inputIndex);
- }
- }
- }
- }
- output_index += m_otherStride;
- }
- }
-
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
{
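
TensorImagePatch loses its long legacy block() copy/padding loop and, for now, reports BlockAccessV2 = false while still preferring block access in its inputs. The public extract_image_patches() entry point is unchanged; a small illustrative call that assumes the defaulted unit strides and SAME padding of that overload (shapes arbitrary):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Column-major layout: (channels, rows, cols).
  Eigen::Tensor<float, 3> img(3, 32, 32);
  img.setRandom();

  // 5x5 patches; with SAME padding there is one patch per input position,
  // so the result is (channels, 5, 5, 32 * 32).
  Eigen::Tensor<float, 4> patches = img.extract_image_patches(5, 5);
  return 0;
}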
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
index e1df84a1d..ef6b62620 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
@@ -92,7 +92,6 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index 1da7a4e23..695726e10 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -119,7 +119,6 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
@@ -199,7 +198,6 @@ template<typename ArgType, typename Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 781f1d75b..700337539 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -135,11 +135,6 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- // TODO(andydavis, wuke) Enable BlockAccess for the general case when the
- // performance issue with block-based reshape is resolved.
- BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess &&
- TensorEvaluator<ArgType, Device>::RawAccess &&
- NumInputDims > 0 && NumOutputDims > 0,
// For trivial reshapes with raw access to underlying data we will provide
// zero overhead block access.
// TODO(ezhulenev): Consider adding block access without raw access?
@@ -153,14 +148,6 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
- typedef internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>
- InputTensorBlock;
- typedef internal::TensorBlock<ScalarNoConst, Index, NumOutputDims, Layout>
- OutputTensorBlock;
- typedef internal::TensorBlockReader<ScalarNoConst, Index, NumOutputDims,
- Layout>
- OutputTensorBlockReader;
-
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumOutputDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
@@ -177,30 +164,6 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
// The total size of the reshaped tensor must be equal to the total size
// of the input tensor.
eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions()));
-
- if (BlockAccess) {
- const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims =
- m_impl.dimensions();
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_outputStrides[0] = 1;
- for (int i = 1; i < NumOutputDims; ++i) {
- m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
- }
- m_inputStrides[0] = 1;
- for (int i = 1; i < NumInputDims; ++i) {
- m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
- }
- } else {
- m_outputStrides[NumOutputDims - 1] = 1;
- for (int i = NumOutputDims - 2; i >= 0; --i) {
- m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
- }
- m_inputStrides[NumInputDims - 1] = 1;
- for (int i = NumInputDims - 2; i >= 0; --i) {
- m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
- }
- }
- }
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -249,128 +212,6 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
Index size;
Index count;
};
- // TODO(andydavis) Reduce the overhead of this function.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
- OutputTensorBlock* output_block) const {
- if (m_impl.data() != NULL) {
- OutputTensorBlockReader::Run(output_block, m_impl.data());
- return;
- }
-
- // Calculate output block unit-stride inner dimension length.
- const DSizes<Index, NumOutputDims>& output_block_sizes =
- output_block->block_sizes();
- Index output_inner_dim_size = 1;
- Index output_outer_dim_start = NumOutputDims;
- for (Index i = 0; i < NumOutputDims; ++i) {
- const Index dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? i : NumOutputDims - i - 1;
- output_inner_dim_size *= output_block_sizes[dim];
- if (output_block_sizes[dim] < m_dimensions[dim]) {
- output_outer_dim_start = i + 1;
- break;
- }
- }
-
- // Initialize output block iterator state.
- array<BlockIteratorState, NumOutputDims> block_iter_state;
-
- for (Index i = 0; i < NumOutputDims; ++i) {
- const Index dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? i : NumOutputDims - i - 1;
- block_iter_state[i].size = output_block_sizes[dim];
- block_iter_state[i].stride = m_outputStrides[dim];
- block_iter_state[i].span =
- block_iter_state[i].stride * (block_iter_state[i].size - 1);
- block_iter_state[i].count = 0;
- }
-
- const Index output_outer_dim_size = output_block_sizes.TotalSize() /
- output_inner_dim_size;
- const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims =
- m_impl.dimensions();
-
- Index index = output_block->first_coeff_index();
- for (Index outer_idx = 0; outer_idx < output_outer_dim_size; ++outer_idx) {
- Index inner_idx = 0;
- while (inner_idx < output_inner_dim_size) {
- // Calculate input coords based on 'index'.
- array<Index, NumInputDims> input_coords;
- Index idx = index;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = NumInputDims - 1; i > 0; --i) {
- input_coords[i] = idx / m_inputStrides[i];
- idx -= input_coords[i] * m_inputStrides[i];
- }
- input_coords[0] = idx;
- } else {
- for (int i = 0; i < NumInputDims - 1; ++i) {
- input_coords[i] = idx / m_inputStrides[i];
- idx -= input_coords[i] * m_inputStrides[i];
- }
- input_coords[NumInputDims - 1] = idx;
- }
-
- // Calculate target input block shape, using at most
- // 'output_inner_dim_size' coefficients along the input block's inner
- // dimensions.
- DSizes<Index, NumInputDims> input_block_sizes;
- Index num_to_allocate = output_inner_dim_size - inner_idx;
- for (Index i = 0; i < NumInputDims; ++i) {
- const Index dim =
- static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? i : NumInputDims - i - 1;
- input_block_sizes[dim] = numext::mini(
- num_to_allocate, (static_cast<Index>(input_dims[dim]) -
- input_coords[dim]));
- if (input_coords[dim] == 0) {
- num_to_allocate /= input_block_sizes[dim];
- } else {
- num_to_allocate = 1;
- }
- }
-
- // Calculate input block strides.
- DSizes<Index, NumInputDims> input_block_strides;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- input_block_strides[0] = 1;
- for (int i = 1; i < NumInputDims; ++i) {
- input_block_strides[i] = input_block_strides[i - 1] *
- input_block_sizes[i - 1];
- }
- } else {
- input_block_strides[NumInputDims - 1] = 1;
- for (int i = NumInputDims - 2; i >= 0; --i) {
- input_block_strides[i] = input_block_strides[i + 1] *
- input_block_sizes[i + 1];
- }
- }
-
- // Instantiate and read input block from input tensor.
- InputTensorBlock input_block(index, input_block_sizes,
- input_block_strides, m_inputStrides,
- output_block->data() + outer_idx *
- output_inner_dim_size + inner_idx);
-
- m_impl.block(&input_block);
-
- const Index input_block_total_size = input_block_sizes.TotalSize();
- index += input_block_total_size;
- inner_idx += input_block_total_size;
- }
- eigen_assert(inner_idx == output_inner_dim_size);
- index -= output_inner_dim_size;
- // Update index.
- for (Index i = output_outer_dim_start; i < NumOutputDims; ++i) {
- if (++block_iter_state[i].count < block_iter_state[i].size) {
- index += block_iter_state[i].stride;
- break;
- }
- block_iter_state[i].count = 0;
- index -= block_iter_state[i].span;
- }
- }
- }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
@@ -408,8 +249,6 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
protected:
TensorEvaluator<ArgType, Device> m_impl;
NewDimensions m_dimensions;
- DSizes<Index, NumOutputDims> m_outputStrides;
- DSizes<Index, NumInputDims> m_inputStrides;
};
@@ -426,7 +265,6 @@ template<typename NewDimensions, typename ArgType, typename Device>
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -619,7 +457,6 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
// slice offsets and sizes.
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -714,7 +551,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
}
}
// Use memcpy if it's going to be faster than using the regular evaluation.
- const MemcpyTriggerForSlicing<Index, Device, BlockAccess> trigger(m_device);
+ const MemcpyTriggerForSlicing<Index, Device, BlockAccessV2> trigger(m_device);
if (trigger(internal::array_prod(dimensions()), contiguous_values)) {
EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data();
for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
@@ -808,16 +645,6 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
m_impl.getResourceRequirements(resources);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
- TensorBlock* output_block) const {
- TensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
- output_block->block_sizes(),
- output_block->block_strides(),
- TensorBlockDimensions(m_inputStrides),
- output_block->data());
- m_impl.block(&input_block);
- }
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
@@ -922,7 +749,6 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -1124,7 +950,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
// slice offsets and sizes.
IsAligned = false,
PacketAccess = false,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -1306,7 +1131,6 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
enum {
IsAligned = false,
PacketAccess = false,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
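
In TensorMorphing.h, block access for reshape now exists only through the V2 raw-access path, so the legacy block() routine and the m_inputStrides/m_outputStrides bookkeeping above could be dropped, and slicing simply forwards the argument's BlockAccessV2, which is also what now drives the memcpy trigger. The public reshape/slice API is unchanged; an illustrative example with arbitrary shapes, offsets and extents:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 3> t(2, 3, 4);
  t.setRandom();

  // Trivial reshape: same data, new dimensions (2 * 3 * 4 == 6 * 4).
  Eigen::array<Eigen::Index, 2> new_dims{{6, 4}};
  Eigen::Tensor<float, 2> r = t.reshape(new_dims);

  // Slice: per-dimension offsets and extents.
  Eigen::array<Eigen::Index, 2> offsets{{1, 0}};
  Eigen::array<Eigen::Index, 2> extents{{4, 2}};
  Eigen::Tensor<float, 2> s = r.slice(offsets, extents);
  return 0;
}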
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 1104f02c7..4a22922d9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -98,7 +98,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
enum {
IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index 80afcff0f..4abe58ecd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -96,7 +96,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index d826cfb7e..84604cf41 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -584,7 +584,6 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
enum {
IsAligned = false,
PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -594,11 +593,6 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
- typedef internal::TensorBlock<ScalarNoConst, Index, NumOutputDims, Layout>
- OutputTensorBlock;
- typedef internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>
- InputTensorBlock;
-
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlockV2;
//===--------------------------------------------------------------------===//
@@ -920,258 +914,6 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
m_impl.getResourceRequirements(resources);
}
- EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void block(
- OutputTensorBlock* output_block) const {
- // Special case full reductions to avoid input block copy below.
- if (NumInputDims == NumReducedDims) {
- eigen_assert(output_block->first_coeff_index() == 0);
- eigen_assert(output_block->block_sizes().TotalSize() == 1);
- Op reducer(m_reducer);
- output_block->data()[0] = internal::InnerMostDimReducer<Self, Op>::reduce(
- *this, 0, m_numValuesToReduce, reducer);
- return;
- }
-
- // Calculate input tensor 'slice' required to reduce output block coeffs.
- DSizes<Index, NumInputDims> input_slice_sizes(m_impl.dimensions());
- for (int i = 0; i < NumOutputDims; ++i) {
- // Clip preserved input dimensions by output block size.
- input_slice_sizes[m_output_to_input_dim_map[i]] =
- output_block->block_sizes()[i];
- }
-
- // Shard input tensor slice into blocks (because it could be large if we
- // need to reduce along several dimensions to calculate required output
- // coefficients).
- const Index max_coeff_count =
- numext::mini<Index>(((m_device.firstLevelCacheSize()) / sizeof(Scalar)),
- input_slice_sizes.TotalSize());
-
- // Calculate max output shard size needed to keep working set of reducers
- // in L1, while leaving enough space for reducer overhead and 'PacketSize'
- // reductions.
- DSizes<Index, NumInputDims> target_input_block_sizes;
- CalculateTargetInputBlockShape(max_coeff_count, input_slice_sizes,
- &target_input_block_sizes);
- // Calculate indices for first preserved dimension.
- const Index first_preserved_dim_output_index =
- static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 0
- : NumOutputDims - 1;
- const Index first_preserved_dim_input_index =
- m_output_to_input_dim_map[first_preserved_dim_output_index];
- const bool inner_most_dim_preserved =
- PreservingInnerMostDims ||
- (first_preserved_dim_input_index ==
- (static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 0
- : NumInputDims - 1));
-
- // Calculate output block inner/outer dimension sizes.
- const Index output_block_inner_dim_size =
- output_block->block_sizes()[first_preserved_dim_output_index];
- const Index output_block_outer_dim_size =
- output_block->block_sizes().TotalSize() / output_block_inner_dim_size;
- // Calculate shard size for first preserved dimension.
- const Index output_shard_size =
- target_input_block_sizes[first_preserved_dim_input_index];
- const Index num_output_shards =
- (output_block_inner_dim_size + output_shard_size - 1) /
- output_shard_size;
-
- // Initialize 'tensor_slice_offsets' from input coords of output index.
- DSizes<Index, NumInputDims> tensor_slice_offsets;
- GetInputCoordsForOutputIndex(output_block->first_coeff_index(),
- &tensor_slice_offsets);
-
- // Store tensor slice offset in first preserved dimension to be used
- // to update tensor slice extents in loop below.
- const Index first_preserved_dim_offset_start =
- tensor_slice_offsets[first_preserved_dim_input_index];
-
- array<BlockIteratorState, NumOutputDims> block_iter_state;
-
- // Initialize state used to iterate through output coefficients
- // and update 'tensor_slice_offsets' in outer preserved dims.
- for (int i = 0; i < NumOutputDims - 1; ++i) {
- const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? i + 1
- : NumOutputDims - i - 2;
- block_iter_state[i].input_dim = m_output_to_input_dim_map[dim];
- block_iter_state[i].output_size = output_block->block_sizes()[dim];
- block_iter_state[i].output_count = 0;
- }
-
- // Allocate input block memory.
- ScalarNoConst* input_block_data = static_cast<ScalarNoConst*>(
- m_device.allocate(max_coeff_count * sizeof(Scalar)));
- // Allocate reducer memory.
- const bool packet_reductions_enabled =
- (Self::InputPacketAccess & Self::ReducerTraits::PacketAccess);
- const Index num_reducers =
- (inner_most_dim_preserved && packet_reductions_enabled)
- ? (output_shard_size / PacketSize + output_shard_size % PacketSize +
- PacketSize)
- : output_shard_size;
- typedef internal::BlockReducer<Self, Op> BlockReducer;
- BlockReducer* reducers = static_cast<BlockReducer*>(
- m_device.allocate(num_reducers * sizeof(BlockReducer)));
-
- InputDimensions input_tensor_dims(m_impl.dimensions());
- for (Index output_outer_index = 0;
- output_outer_index < output_block_outer_dim_size;
- ++output_outer_index) {
- for (Index output_shard_index = 0; output_shard_index < num_output_shards;
- ++output_shard_index) {
- // Initialize 'tensor_slice_extents' for this output shard.
- DSizes<Index, NumInputDims> tensor_slice_extents(input_slice_sizes);
- for (int i = 0; i < NumInputDims; ++i) {
- if (i == first_preserved_dim_input_index) {
- // Clip first preserved dim size to output shard size.
- tensor_slice_extents[i] = numext::mini(
- output_shard_size,
- input_slice_sizes[i] - (tensor_slice_offsets[i] -
- first_preserved_dim_offset_start));
-
- } else if (!m_reduced[i]) {
- // Clip outer preserved dims to size 1, so that we reduce a
- // contiguous set of output coefficients.
- tensor_slice_extents[i] = 1;
- }
- }
-
- // Initialize output coefficient reducers.
- for (int i = 0; i < num_reducers; ++i) {
- new (&reducers[i]) BlockReducer(m_reducer);
- }
-
- typedef internal::TensorSliceBlockMapper<ScalarNoConst, Index,
- NumInputDims, Layout>
- TensorSliceBlockMapper;
-
- // TODO(andydavis) Consider removing 'input_block_stride_order' if we
- // find that scattered reads are not worth supporting in
- // TensorSliceBlockMapper.
- TensorSliceBlockMapper block_mapper(
- typename TensorSliceBlockMapper::Dimensions(input_tensor_dims),
- tensor_slice_offsets, tensor_slice_extents,
- target_input_block_sizes, DimensionList<Index, NumInputDims>());
-
- const Index num_outputs_to_update =
- tensor_slice_extents[first_preserved_dim_input_index];
- const Index preserved_dim_vector_reducer_count =
- (inner_most_dim_preserved && packet_reductions_enabled)
- ? num_outputs_to_update / PacketSize
- : 0;
- const Index preserved_dim_vector_coeff_count =
- inner_most_dim_preserved
- ? preserved_dim_vector_reducer_count * PacketSize
- : 0;
- const Index preserved_dim_reducer_limit =
- (inner_most_dim_preserved && packet_reductions_enabled)
- ? (preserved_dim_vector_reducer_count +
- num_outputs_to_update % PacketSize)
- : num_outputs_to_update;
-
- const Index total_block_count = block_mapper.total_block_count();
- for (Index b = 0; b < total_block_count; ++b) {
- InputTensorBlock input_block =
- block_mapper.GetBlockForIndex(b, input_block_data);
- // Read.
- m_impl.block(&input_block);
-
- Index num_values_to_reduce = 1;
- for (Index i = 0; i < NumInputDims; ++i) {
- if (m_reduced[i]) {
- num_values_to_reduce *= input_block.block_sizes()[i];
- }
- }
- // Reduce.
- if (inner_most_dim_preserved) {
- const Index input_outer_dim_size =
- input_block.block_sizes().TotalSize() / num_outputs_to_update;
- for (Index input_outer_dim_index = 0;
- input_outer_dim_index < input_outer_dim_size;
- ++input_outer_dim_index) {
- const Index input_outer_dim_base =
- input_outer_dim_index * num_outputs_to_update;
- for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) {
- reducers[i].Reduce(input_outer_dim_base + i * PacketSize,
- PacketSize, input_block.data());
- }
- const Index scalar_reducer_base =
- input_outer_dim_base + preserved_dim_vector_coeff_count;
- for (Index i = preserved_dim_vector_reducer_count;
- i < preserved_dim_reducer_limit; ++i) {
- reducers[i].Reduce(scalar_reducer_base + i -
- preserved_dim_vector_reducer_count,
- 1, input_block.data());
- }
- }
- } else {
- for (Index i = 0; i < num_outputs_to_update; ++i) {
- reducers[i].Reduce(i * num_values_to_reduce, num_values_to_reduce,
- input_block.data());
- }
- }
- }
-
- // Finalize all reducers for this output shard.
- const Index output_base_index =
- output_outer_index * output_block_inner_dim_size +
- output_shard_index * output_shard_size;
- if (inner_most_dim_preserved) {
- EIGEN_ALIGN_MAX
- typename internal::remove_const<CoeffReturnType>::type
- values[PacketSize];
- for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) {
- const Index reducer_base = output_base_index + i * PacketSize;
- internal::pstore<CoeffReturnType, PacketReturnType>(
- values, reducers[i].FinalizePacket());
- for (Index j = 0; j < PacketSize; ++j) {
- output_block->data()[reducer_base + j] = values[j];
- }
- }
- const Index scalar_reducer_base =
- output_base_index + preserved_dim_vector_coeff_count;
-
- for (Index i = preserved_dim_vector_reducer_count;
- i < preserved_dim_reducer_limit; ++i) {
- output_block->data()[scalar_reducer_base + i -
- preserved_dim_vector_reducer_count] =
- reducers[i].Finalize();
- }
- } else {
- for (int i = 0; i < num_outputs_to_update; ++i) {
- output_block->data()[output_base_index + i] =
- reducers[i].Finalize();
- }
- }
-
- // Update 'tensor_slice_offsets' by num outputs for this output shard.
- tensor_slice_offsets[first_preserved_dim_input_index] +=
- num_outputs_to_update;
- }
- // Update slice offset for inner preserved dim.
- tensor_slice_offsets[first_preserved_dim_input_index] -=
- output_block_inner_dim_size;
- // Update slice offsets for remaining output dims.
- for (int i = 0; i < NumOutputDims - 1; ++i) {
- BlockIteratorState& b = block_iter_state[i];
- if (++b.output_count < b.output_size) {
- ++tensor_slice_offsets[b.input_dim];
- break;
- }
- b.output_count = 0;
- tensor_slice_offsets[b.input_dim] -= b.output_size - 1;
- }
- }
-
- // Free memory.
- m_device.deallocate(input_block_data);
- m_device.deallocate(reducers);
- }
-
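The removed reduction path above assigns one BlockReducer per output coefficient and, in the non-inner-preserved case, feeds each reducer a contiguous run of num_values_to_reduce input values. A minimal standalone sketch of that per-output reduction loop, assuming a plain sum reducer and a buffer in which each output's reduced values are stored contiguously (the names here are illustrative, not Eigen API):

#include <cassert>
#include <cstddef>
#include <vector>

// Reduce a block laid out as num_outputs contiguous runs of
// num_values_to_reduce values each: output[i] = sum of run i.
std::vector<float> ReduceBlockRuns(const std::vector<float>& block,
                                   std::size_t num_outputs,
                                   std::size_t num_values_to_reduce) {
  assert(block.size() == num_outputs * num_values_to_reduce);
  std::vector<float> output(num_outputs, 0.0f);
  for (std::size_t i = 0; i < num_outputs; ++i) {
    const float* run = block.data() + i * num_values_to_reduce;
    for (std::size_t j = 0; j < num_values_to_reduce; ++j) {
      output[i] += run[j];  // mirrors reducers[i].Reduce(i * n, n, data)
    }
  }
  return output;
}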
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
index 87072006d..ff5bfad46 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
@@ -141,7 +141,6 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
enum {
IsAligned = false,
PacketAccess = false,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = PlainObjectType::Layout,
@@ -378,7 +377,6 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
enum {
IsAligned = false,
PacketAccess = false,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorRef<Derived>::Layout,
@@ -432,7 +430,6 @@ struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<cons
enum {
IsAligned = false,
PacketAccess = false,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
RawAccess = false
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index ae3ab5f81..1db5d2e5e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -115,7 +115,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = true,
BlockAccessV2 = NumDims > 0,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -248,112 +247,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
internal::kSkewedInnerDims, block_total_size_max));
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
- OutputTensorBlock* output_block) const {
- if (NumDims <= 0) return;
-
-    // TODO(ezhulenev): If the underlying tensor expression supports and
-    // prefers block evaluation, we must use it. Currently we use coeff and
-    // packet access into the underlying tensor expression.
- // static const bool useBlockAccessForArgType =
- // TensorEvaluator<ArgType, Device>::BlockAccess &&
- // TensorEvaluator<ArgType, Device>::PreferBlockAccess;
-
- static const bool isColMajor =
- static_cast<int>(Layout) == static_cast<int>(ColMajor);
-
- static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
- const bool inner_dim_reversed = m_reverse[inner_dim_idx];
-
- CoeffReturnType* data = output_block->data();
- Index block_offset = 0;
-
- Index input_offset = reverseIndex(output_block->first_coeff_index());
-
-    // Initialize output block iterator state. Dimensions in this array are
-    // always in inner_most -> outer_most order (col-major layout).
- array<BlockIteratorState, NumDims> it;
- for (Index i = 0; i < NumDims; ++i) {
- const Index dim = isColMajor ? i : NumDims - 1 - i;
- it[i].size = output_block->block_sizes()[dim];
- it[i].count = 0;
- it[i].reverse = m_reverse[dim];
-
- it[i].block_stride = output_block->block_strides()[dim];
- it[i].block_span = it[i].block_stride * (it[i].size - 1);
-
- it[i].input_stride = m_strides[dim];
- it[i].input_span = it[i].input_stride * (it[i].size - 1);
-
- if (it[i].reverse) {
- it[i].input_stride = -1 * it[i].input_stride;
- it[i].input_span = -1 * it[i].input_span;
- }
- }
-
- // If multiple inner dimensions have the same reverse flag, check if we can
- // merge them into a single virtual inner dimension.
- int effective_inner_dim = 0;
- for (int i = 1; i < NumDims; ++i) {
- if (it[i].reverse != it[effective_inner_dim].reverse) break;
- if (it[i].block_stride != it[effective_inner_dim].size) break;
- if (it[i].block_stride != numext::abs(it[i].input_stride)) break;
-
- it[i].size = it[effective_inner_dim].size * it[i].size;
-
- it[i].block_stride = 1;
- it[i].input_stride = (inner_dim_reversed ? -1 : 1);
-
- it[i].block_span = it[i].block_stride * (it[i].size - 1);
- it[i].input_span = it[i].input_stride * (it[i].size - 1);
-
- effective_inner_dim = i;
- }
-
- eigen_assert(it[effective_inner_dim].block_stride == 1);
- eigen_assert(it[effective_inner_dim].input_stride ==
- (inner_dim_reversed ? -1 : 1));
-
- const Index inner_dim_size = it[effective_inner_dim].size;
-
- while (it[NumDims - 1].count < it[NumDims - 1].size) {
- // Copy inner-most dimension data from reversed location in input.
- Index dst = block_offset;
- Index src = input_offset;
-
- // NOTE(ezhulenev): Adding vectorized path with internal::preverse showed
- // worse results in benchmarks than a simple coefficient loop.
- if (inner_dim_reversed) {
- for (Index i = 0; i < inner_dim_size; ++i) {
- data[dst] = m_impl.coeff(src);
- ++dst;
- --src;
- }
- } else {
- for (Index i = 0; i < inner_dim_size; ++i) {
- data[dst] = m_impl.coeff(src);
- ++dst;
- ++src;
- }
- }
-
- // For the 1d tensor we need to generate only one inner-most dimension.
- if ((NumDims - effective_inner_dim) == 1) break;
-
- // Update offset.
- for (Index i = effective_inner_dim + 1; i < NumDims; ++i) {
- if (++it[i].count < it[i].size) {
- block_offset += it[i].block_stride;
- input_offset += it[i].input_stride;
- break;
- }
- if (i != NumDims - 1) it[i].count = 0;
- block_offset -= it[i].block_span;
- input_offset -= it[i].input_span;
- }
- }
- }
-
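The removed block() above copies the inner-most dimension with a plain scalar loop, stepping the input index backwards when that dimension is reversed (the NOTE explains that a vectorized internal::preverse path benchmarked worse). A self-contained sketch of that inner copy; the function name and signature are illustrative:

#include <cstddef>

// Copy n coefficients of the inner-most dimension into dst. When the
// dimension is reversed the source index walks backwards from src_start,
// otherwise forwards, matching the two loops in the removed code.
void CopyInnerDim(float* dst, const float* src, std::ptrdiff_t src_start,
                  std::size_t n, bool reversed) {
  std::ptrdiff_t src_idx = src_start;
  const std::ptrdiff_t step = reversed ? -1 : 1;
  for (std::size_t i = 0; i < n; ++i) {
    dst[i] = src[src_idx];
    src_idx += step;
  }
}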
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
@@ -535,7 +428,6 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 1e6fc93b1..d8005d604 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -99,7 +99,6 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
enum {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index df4cd1eb3..72c43a39d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -115,7 +115,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -125,11 +124,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
- typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
- TensorBlock;
- typedef internal::TensorBlockReader<ScalarNoConst, Index, NumDims, Layout>
- TensorBlockReader;
-
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
@@ -249,98 +243,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
internal::kUniformAllDims, block_total_size_max));
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
- TensorBlock* output_block) const {
- if (m_impl.data() != NULL) {
- // Fast path: we have direct access to the data, so shuffle as we read.
- TensorBlockReader::Run(output_block,
- srcCoeff(output_block->first_coeff_index()),
- m_inverseShuffle,
- m_unshuffledInputStrides,
- m_impl.data());
- return;
- }
-
- // Slow path: read unshuffled block from the input and shuffle in-place.
- // Initialize input block sizes using input-to-output shuffle map.
- DSizes<Index, NumDims> input_block_sizes;
- for (Index i = 0; i < NumDims; ++i) {
- input_block_sizes[i] = output_block->block_sizes()[m_inverseShuffle[i]];
- }
-
- // Calculate input block strides.
- DSizes<Index, NumDims> input_block_strides;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- input_block_strides[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- input_block_strides[i] =
- input_block_strides[i - 1] * input_block_sizes[i - 1];
- }
- } else {
- input_block_strides[NumDims - 1] = 1;
- for (int i = NumDims - 2; i >= 0; --i) {
- input_block_strides[i] =
- input_block_strides[i + 1] * input_block_sizes[i + 1];
- }
- }
- DSizes<internal::TensorIntDivisor<Index>, NumDims> fast_input_block_strides;
- for (int i = 0; i < NumDims; ++i) {
- fast_input_block_strides[i] =
- internal::TensorIntDivisor<Index>(input_block_strides[i]);
- }
-
- // Read input block.
- TensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
- input_block_sizes,
- input_block_strides,
- Dimensions(m_unshuffledInputStrides),
- output_block->data());
-
- m_impl.block(&input_block);
-
- // Naive In-place shuffle: random IO but block size is O(L1 cache size).
- // TODO(andydavis) Improve the performance of this in-place shuffle.
- const Index total_size = input_block_sizes.TotalSize();
- std::vector<bool> bitmap(total_size, false);
- ScalarNoConst* data = const_cast<ScalarNoConst*>(output_block->data());
- const DSizes<Index, NumDims>& output_block_strides =
- output_block->block_strides();
- for (Index input_index = 0; input_index < total_size; ++input_index) {
- if (bitmap[input_index]) {
- // Coefficient at this index has already been shuffled.
- continue;
- }
-
- Index output_index =
- GetBlockOutputIndex(input_index, input_block_strides,
- output_block_strides, fast_input_block_strides);
- if (output_index == input_index) {
- // Coefficient already in place.
- bitmap[output_index] = true;
- continue;
- }
-
-      // The following loop starts at 'input_index', and shuffles
-      // coefficients into their shuffled location at 'output_index'.
-      // It skips through the array shuffling coefficients by following
-      // the shuffle cycle starting and ending at 'input_index'.
- ScalarNoConst evicted_value;
- ScalarNoConst shuffled_value = data[input_index];
- do {
- evicted_value = data[output_index];
- data[output_index] = shuffled_value;
- shuffled_value = evicted_value;
- bitmap[output_index] = true;
- output_index =
- GetBlockOutputIndex(output_index, input_block_strides,
- output_block_strides, fast_input_block_strides);
- } while (output_index != input_index);
-
- data[output_index] = shuffled_value;
- bitmap[output_index] = true;
- }
- }
-
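The slow path above shuffles the block in place by following permutation cycles and marking visited positions in a bitmap, so each coefficient is moved exactly once. The same cycle-following idea as a free function over an explicit permutation; the dst_of callback stands in for GetBlockOutputIndex and is an assumption for illustration:

#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

// Apply the permutation in place: the element at position i ends up at
// dst_of(i). Each cycle is walked once; `done` marks placed elements.
void PermuteInPlace(std::vector<float>& data,
                    const std::function<std::size_t(std::size_t)>& dst_of) {
  std::vector<bool> done(data.size(), false);
  for (std::size_t start = 0; start < data.size(); ++start) {
    if (done[start]) continue;
    std::size_t dst = dst_of(start);
    if (dst == start) {  // fixed point: already in place
      done[start] = true;
      continue;
    }
    float carried = data[start];
    do {
      std::swap(carried, data[dst]);  // drop carried value, pick up evicted one
      done[dst] = true;
      dst = dst_of(dst);
    } while (dst != start);
    data[start] = carried;  // close the cycle
    done[start] = true;
  }
}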
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool root_of_expr_ast = false) const {
@@ -462,7 +364,6 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -471,11 +372,6 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
- typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
- TensorBlock;
- typedef internal::TensorBlockWriter<ScalarNoConst, Index, NumDims, Layout>
- TensorBlockWriter;
-
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
//===--------------------------------------------------------------------===//
@@ -502,15 +398,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
}
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
- const TensorBlock& block) {
- eigen_assert(this->m_impl.data() != NULL);
- TensorBlockWriter::Run(block, this->srcCoeff(block.first_coeff_index()),
- this->m_inverseShuffle,
- this->m_unshuffledInputStrides, this->m_impl.data());
- }
-
-template <typename TensorBlockV2>
+ template <typename TensorBlockV2>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2(
const TensorBlockDesc& desc, const TensorBlockV2& block) {
eigen_assert(this->m_impl.data() != NULL);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 061bf6bdf..8a7fcac23 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -114,7 +114,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
@@ -288,7 +287,6 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
index 676717d8d..209d6fb3b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
@@ -97,7 +97,6 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index ced963175..a4c38f118 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -183,7 +183,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
BlockAccessV2 = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp
index 0fb189e09..8d3ca84c8 100644
--- a/unsupported/test/cxx11_tensor_block_access.cpp
+++ b/unsupported/test/cxx11_tensor_block_access.cpp
@@ -46,22 +46,6 @@ static DSizes<Index, NumDims> RandomDims() {
return DSizes<Index, NumDims>(dims);
}
-/** Dummy data type to test TensorBlock copy ops. */
-struct Data {
- Data() : value(0) {}
- explicit Data(int v) : value(v) { }
- int value;
-};
-
-bool operator==(const Data& lhs, const Data& rhs) {
- return lhs.value == rhs.value;
-}
-
-std::ostream& operator<<(std::ostream& os, const Data& d) {
- os << "Data: value=" << d.value;
- return os;
-}
-
template <typename T>
static T* GenerateRandomData(const Index& size) {
T* data = new T[size];
@@ -71,15 +55,6 @@ static T* GenerateRandomData(const Index& size) {
return data;
}
-template <>
-Data* GenerateRandomData(const Index& size) {
- Data* data = new Data[size];
- for (int i = 0; i < size; ++i) {
- data[i] = Data(internal::random<int>(1, 100));
- }
- return data;
-}
-
template <int NumDims>
static void Debug(DSizes<Index, NumDims> dims) {
for (int i = 0; i < NumDims; ++i) {
@@ -183,84 +158,6 @@ static void test_block_mapper_maps_every_element() {
VERIFY_IS_EQUAL(*coeff_set.rbegin(), total_coeffs - 1);
}
-template <typename T, int NumDims, int Layout>
-static void test_slice_block_mapper_maps_every_element() {
- typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
- typedef internal::TensorSliceBlockMapper<T, Index, NumDims, Layout> TensorSliceBlockMapper;
-
- DSizes<Index, NumDims> tensor_dims = RandomDims<NumDims>();
- DSizes<Index, NumDims> tensor_slice_offsets = RandomDims<NumDims>();
- DSizes<Index, NumDims> tensor_slice_extents = RandomDims<NumDims>();
-
- // Make sure that tensor offsets + extents do not overflow.
- for (int i = 0; i < NumDims; ++i) {
- tensor_slice_offsets[i] =
- numext::mini(tensor_dims[i] - 1, tensor_slice_offsets[i]);
- tensor_slice_extents[i] = numext::mini(
- tensor_slice_extents[i], tensor_dims[i] - tensor_slice_offsets[i]);
- }
-
- // Keep track of elements indices available via block access.
- std::set<Index> coeff_set;
-
- int total_coeffs = static_cast<int>(tensor_slice_extents.TotalSize());
-
-  // Pick random dimension sizes for the tensor blocks.
- DSizes<Index, NumDims> block_sizes;
- for (int i = 0; i < NumDims; ++i) {
- block_sizes[i] = internal::random<Index>(1, tensor_slice_extents[i]);
- }
-
- TensorSliceBlockMapper block_mapper(tensor_dims, tensor_slice_offsets,
- tensor_slice_extents, block_sizes,
- DimensionList<Index, NumDims>());
-
- for (int i = 0; i < block_mapper.total_block_count(); ++i) {
- TensorBlock block = block_mapper.GetBlockForIndex(i, NULL);
- UpdateCoeffSet<T, Layout, NumDims>(block, block.first_coeff_index(),
- choose(Layout, NumDims - 1, 0),
- &coeff_set);
- }
-
- VERIFY_IS_EQUAL(Index(coeff_set.size()), total_coeffs);
-}
-
-template <typename T, int NumDims, int Layout>
-static void test_block_io_copy_data_from_source_to_target() {
- typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
- typedef internal::TensorBlockMapper<T, Index, NumDims, Layout>
- TensorBlockMapper;
-
- typedef internal::TensorBlockReader<T, Index, NumDims, Layout>
- TensorBlockReader;
- typedef internal::TensorBlockWriter<T, Index, NumDims, Layout>
- TensorBlockWriter;
-
- DSizes<Index, NumDims> input_tensor_dims = RandomDims<NumDims>();
- const Index input_tensor_size = input_tensor_dims.TotalSize();
-
- T* input_data = GenerateRandomData<T>(input_tensor_size);
- T* output_data = new T[input_tensor_size];
-
- TensorBlockMapper block_mapper(input_tensor_dims, RandomShape(),
- RandomTargetSize(input_tensor_dims));
- T* block_data = new T[block_mapper.block_dims_total_size()];
-
- for (int i = 0; i < block_mapper.total_block_count(); ++i) {
- TensorBlock block = block_mapper.GetBlockForIndex(i, block_data);
- TensorBlockReader::Run(&block, input_data);
- TensorBlockWriter::Run(block, output_data);
- }
-
- for (int i = 0; i < input_tensor_size; ++i) {
- VERIFY_IS_EQUAL(input_data[i], output_data[i]);
- }
-
- delete[] input_data;
- delete[] output_data;
- delete[] block_data;
-}
-
template <int Layout, int NumDims>
static Index GetInputIndex(Index output_index,
const array<Index, NumDims>& output_to_input_dim_map,
@@ -304,179 +201,6 @@ static array<Index, NumDims> ComputeStrides(
return strides;
}
-template <typename T, int NumDims, int Layout>
-static void test_block_io_copy_using_reordered_dimensions() {
- typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
- typedef internal::TensorBlockMapper<T, Index, NumDims, Layout>
- TensorBlockMapper;
-
- typedef internal::TensorBlockReader<T, Index, NumDims, Layout>
- TensorBlockReader;
- typedef internal::TensorBlockWriter<T, Index, NumDims, Layout>
- TensorBlockWriter;
-
- DSizes<Index, NumDims> input_tensor_dims = RandomDims<NumDims>();
- const Index input_tensor_size = input_tensor_dims.TotalSize();
-
- // Create a random input tensor.
- T* input_data = GenerateRandomData<T>(input_tensor_size);
-
- // Create a random dimension re-ordering/shuffle.
- std::vector<Index> shuffle;
- for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
- std::random_shuffle(shuffle.begin(), shuffle.end());
-
- DSizes<Index, NumDims> output_tensor_dims;
- array<Index, NumDims> input_to_output_dim_map;
- array<Index, NumDims> output_to_input_dim_map;
- for (Index i = 0; i < NumDims; ++i) {
- output_tensor_dims[shuffle[i]] = input_tensor_dims[i];
- input_to_output_dim_map[i] = shuffle[i];
- output_to_input_dim_map[shuffle[i]] = i;
- }
-
- // Random block shape and size.
- TensorBlockMapper block_mapper(output_tensor_dims, RandomShape(),
- RandomTargetSize(input_tensor_dims));
-
- T* block_data = new T[block_mapper.block_dims_total_size()];
- T* output_data = new T[input_tensor_size];
-
- array<Index, NumDims> input_tensor_strides =
- ComputeStrides<Layout, NumDims>(input_tensor_dims);
- array<Index, NumDims> output_tensor_strides =
- ComputeStrides<Layout, NumDims>(output_tensor_dims);
-
- for (Index i = 0; i < block_mapper.total_block_count(); ++i) {
- TensorBlock block = block_mapper.GetBlockForIndex(i, block_data);
- const Index first_coeff_index = GetInputIndex<Layout, NumDims>(
- block.first_coeff_index(), output_to_input_dim_map,
- input_tensor_strides, output_tensor_strides);
- TensorBlockReader::Run(&block, first_coeff_index, input_to_output_dim_map,
- input_tensor_strides, input_data);
- TensorBlockWriter::Run(block, first_coeff_index, input_to_output_dim_map,
- input_tensor_strides, output_data);
- }
-
- for (int i = 0; i < input_tensor_size; ++i) {
- VERIFY_IS_EQUAL(input_data[i], output_data[i]);
- }
-
- delete[] input_data;
- delete[] block_data;
- delete[] output_data;
-}
-
-// This is the special case for reading data with reordering, when dimensions
-// before/after reordering are the same. Squeezing reads along inner dimensions
-// in this case is illegal, because we reorder the innermost dimension.
-template <int Layout>
-static void test_block_io_copy_using_reordered_dimensions_do_not_squeeze()
-{
- typedef internal::TensorBlock<float, Index, 3, Layout> TensorBlock;
- typedef internal::TensorBlockReader<float, Index, 3, Layout>
- TensorBlockReader;
-
- DSizes<Index, 3> tensor_dims;
- tensor_dims[0] = 7;
- tensor_dims[1] = 9;
- tensor_dims[2] = 7;
-
- DSizes<Index, 3> block_dims = tensor_dims;
-
- DSizes<Index, 3> tensor_to_block_dim_map;
- tensor_to_block_dim_map[0] = 2;
- tensor_to_block_dim_map[1] = 1;
- tensor_to_block_dim_map[2] = 0;
-
- DSizes<Index, 3> tensor_strides(ComputeStrides<Layout, 3>(tensor_dims));
- DSizes<Index, 3> block_strides(ComputeStrides<Layout, 3>(block_dims));
-
- const Index tensor_size = tensor_dims.TotalSize();
- float* tensor_data = GenerateRandomData<float>(tensor_size);
- float* block_data = new float[tensor_size];
-
- TensorBlock block(0, block_dims, block_strides, tensor_strides, block_data);
- TensorBlockReader::Run(&block,
- 0,
- tensor_to_block_dim_map,
- tensor_strides,
- tensor_data);
-
- TensorMap<Tensor<float, 3, Layout> > block_tensor(block_data, block_dims);
- TensorMap<Tensor<float, 3, Layout> > tensor_tensor(tensor_data, tensor_dims);
-
- for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) {
- for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) {
- for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) {
- float block_value = block_tensor(d2, d1, d0);
- float tensor_value = tensor_tensor(d0, d1, d2);
- VERIFY_IS_EQUAL(block_value, tensor_value);
- }
- }
- }
-
- delete[] block_data;
- delete[] tensor_data;
-}
-
-// This is the special case for reading data with reordering, when dimensions
-// before/after reordering are the same. Squeezing reads in this case is allowed
-// because we reorder outer dimensions.
-template <int Layout>
-static void test_block_io_copy_using_reordered_dimensions_squeeze()
-{
- typedef internal::TensorBlock<float, Index, 4, Layout> TensorBlock;
- typedef internal::TensorBlockReader<float, Index, 4, Layout>
- TensorBlockReader;
-
- DSizes<Index, 4> tensor_dims;
- tensor_dims[0] = 7;
- tensor_dims[1] = 5;
- tensor_dims[2] = 9;
- tensor_dims[3] = 9;
-
- DSizes<Index, 4> block_dims = tensor_dims;
-
- DSizes<Index, 4> tensor_to_block_dim_map;
- tensor_to_block_dim_map[0] = 0;
- tensor_to_block_dim_map[1] = 1;
- tensor_to_block_dim_map[2] = 3;
- tensor_to_block_dim_map[3] = 2;
-
- DSizes<Index, 4> tensor_strides(ComputeStrides<Layout, 4>(tensor_dims));
- DSizes<Index, 4> block_strides(ComputeStrides<Layout, 4>(block_dims));
-
- const Index tensor_size = tensor_dims.TotalSize();
- float* tensor_data = GenerateRandomData<float>(tensor_size);
- float* block_data = new float[tensor_size];
-
- TensorBlock block(0, block_dims, block_strides, tensor_strides, block_data);
- TensorBlockReader::Run(&block,
- 0,
- tensor_to_block_dim_map,
- tensor_strides,
- tensor_data);
-
- TensorMap<Tensor<float, 4, Layout> > block_tensor(block_data, block_dims);
- TensorMap<Tensor<float, 4, Layout> > tensor_tensor(tensor_data, tensor_dims);
-
- for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) {
- for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) {
- for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) {
- for (Index d3 = 0; d3 < tensor_dims[3]; ++d3) {
- float block_value = block_tensor(d0, d1, d3, d2);
- float tensor_value = tensor_tensor(d0, d1, d2, d3);
- VERIFY_IS_EQUAL(block_value, tensor_value);
- }
- }
- }
- }
-
- delete[] block_data;
- delete[] tensor_data;
-}
-
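The two removed tests above exercise read "squeezing": when adjacent dimensions are stored contiguously in both the source and the destination, a block reader may merge them and issue one longer inner copy instead of many short ones, which is only legal when the reordering does not touch the inner-most dimension. A small sketch of the merge rule under those assumptions (column-major sizes and strides, illustrative names):

#include <cstddef>
#include <vector>

// Merge adjacent (column-major) dimensions that are contiguous in both the
// source and the destination; returns the merged run sizes a copy could use.
std::vector<std::ptrdiff_t> SqueezeDims(
    const std::vector<std::ptrdiff_t>& sizes,
    const std::vector<std::ptrdiff_t>& src_strides,
    const std::vector<std::ptrdiff_t>& dst_strides) {
  std::vector<std::ptrdiff_t> merged;
  if (sizes.empty()) return merged;
  merged.push_back(sizes[0]);
  for (std::size_t i = 1; i < sizes.size(); ++i) {
    const bool src_contig = src_strides[i] == src_strides[i - 1] * sizes[i - 1];
    const bool dst_contig = dst_strides[i] == dst_strides[i - 1] * sizes[i - 1];
    if (src_contig && dst_contig) {
      merged.back() *= sizes[i];  // fold dimension i into the current run
    } else {
      merged.push_back(sizes[i]);  // start a new run at a stride break
    }
  }
  return merged;
}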
template<typename Scalar, typename StorageIndex, int Dim>
class EqualityChecker
{
@@ -511,365 +235,6 @@ public:
};
template <int Layout>
-static void test_block_io_zero_stride()
-{
- typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
- typedef internal::TensorBlockReader<float, Index, 5, Layout>
- TensorBlockReader;
- typedef internal::TensorBlockWriter<float, Index, 5, Layout>
- TensorBlockWriter;
-
- DSizes<Index, 5> rnd_dims = RandomDims<5>();
-
- DSizes<Index, 5> input_tensor_dims = rnd_dims;
- input_tensor_dims[0] = 1;
- input_tensor_dims[2] = 1;
- input_tensor_dims[4] = 1;
- const Index input_tensor_size = input_tensor_dims.TotalSize();
- float* input_data = GenerateRandomData<float>(input_tensor_size);
-
- DSizes<Index, 5> output_tensor_dims = rnd_dims;
-
- DSizes<Index, 5> input_tensor_strides(
- ComputeStrides<Layout, 5>(input_tensor_dims));
- DSizes<Index, 5> output_tensor_strides(
- ComputeStrides<Layout, 5>(output_tensor_dims));
-
- DSizes<Index, 5> input_tensor_strides_with_zeros(input_tensor_strides);
- input_tensor_strides_with_zeros[0] = 0;
- input_tensor_strides_with_zeros[2] = 0;
- input_tensor_strides_with_zeros[4] = 0;
-
- // Verify that data was correctly read/written from/into the block.
- const EqualityChecker<float, Index, 5> verify_is_equal(input_data, input_tensor_dims, input_tensor_strides, output_tensor_dims, output_tensor_strides);
-
- {
- float* output_data = new float[output_tensor_dims.TotalSize()];
- TensorBlock read_block(0, output_tensor_dims, output_tensor_strides,
- input_tensor_strides_with_zeros, output_data);
- TensorBlockReader::Run(&read_block, input_data);
- verify_is_equal(output_data);
- delete[] output_data;
- }
-
- {
- float* output_data = new float[output_tensor_dims.TotalSize()];
- TensorBlock write_block(0, output_tensor_dims,
- input_tensor_strides_with_zeros,
- output_tensor_strides, input_data);
- TensorBlockWriter::Run(write_block, output_data);
- verify_is_equal(output_data);
- delete[] output_data;
- }
-
- delete[] input_data;
-}
-
-template <int Layout>
-static void test_block_io_squeeze_ones() {
- typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
- typedef internal::TensorBlockReader<float, Index, 5, Layout>
- TensorBlockReader;
- typedef internal::TensorBlockWriter<float, Index, 5, Layout>
- TensorBlockWriter;
-
- // Total size > 1.
- {
- DSizes<Index, 5> block_sizes(1, 2, 1, 2, 1);
- const Index total_size = block_sizes.TotalSize();
-
- // Create a random input tensor.
- float* input_data = GenerateRandomData<float>(total_size);
- DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));
-
- {
- float* output_data = new float[block_sizes.TotalSize()];
- TensorBlock read_block(0, block_sizes, strides, strides, output_data);
- TensorBlockReader::Run(&read_block, input_data);
- for (int i = 0; i < total_size; ++i) {
- VERIFY_IS_EQUAL(output_data[i], input_data[i]);
- }
- delete[] output_data;
- }
-
- {
- float* output_data = new float[block_sizes.TotalSize()];
- TensorBlock write_block(0, block_sizes, strides, strides, input_data);
- TensorBlockWriter::Run(write_block, output_data);
- for (int i = 0; i < total_size; ++i) {
- VERIFY_IS_EQUAL(output_data[i], input_data[i]);
- }
- delete[] output_data;
- }
- }
-
- // Total size == 1.
- {
- DSizes<Index, 5> block_sizes(1, 1, 1, 1, 1);
- const Index total_size = block_sizes.TotalSize();
-
- // Create a random input tensor.
- float* input_data = GenerateRandomData<float>(total_size);
- DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));
-
- {
- float* output_data = new float[block_sizes.TotalSize()];
- TensorBlock read_block(0, block_sizes, strides, strides, output_data);
- TensorBlockReader::Run(&read_block, input_data);
- for (int i = 0; i < total_size; ++i) {
- VERIFY_IS_EQUAL(output_data[i], input_data[i]);
- }
- delete[] output_data;
- }
-
- {
- float* output_data = new float[block_sizes.TotalSize()];
- TensorBlock write_block(0, block_sizes, strides, strides, input_data);
- TensorBlockWriter::Run(write_block, output_data);
- for (int i = 0; i < total_size; ++i) {
- VERIFY_IS_EQUAL(output_data[i], input_data[i]);
- }
- delete[] output_data;
- }
- }
-}
-
-template <typename T, int NumDims, int Layout>
-static void test_block_cwise_unary_io_basic() {
- typedef internal::scalar_square_op<T> UnaryFunctor;
- typedef internal::TensorBlockCwiseUnaryIO<UnaryFunctor, Index, T, NumDims,
- Layout>
- TensorBlockCwiseUnaryIO;
-
- DSizes<Index, NumDims> block_sizes = RandomDims<NumDims>();
- DSizes<Index, NumDims> strides(ComputeStrides<Layout, NumDims>(block_sizes));
-
- const Index total_size = block_sizes.TotalSize();
-
-  // Create a random input tensor.
- T* input_data = GenerateRandomData<T>(total_size);
-
- T* output_data = new T[total_size];
- UnaryFunctor functor;
- TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data,
- strides, input_data);
- for (int i = 0; i < total_size; ++i) {
- VERIFY_IS_EQUAL(output_data[i], functor(input_data[i]));
- }
-
- delete[] input_data;
- delete[] output_data;
-}
-
-template <int Layout>
-static void test_block_cwise_unary_io_squeeze_ones() {
- typedef internal::scalar_square_op<float> UnaryFunctor;
- typedef internal::TensorBlockCwiseUnaryIO<UnaryFunctor, Index, float, 5,
- Layout>
- TensorBlockCwiseUnaryIO;
-
- DSizes<Index, 5> block_sizes(1, 2, 1, 3, 1);
- DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));
-
- const Index total_size = block_sizes.TotalSize();
-
-  // Create a random input tensor.
- float* input_data = GenerateRandomData<float>(total_size);
-
- float* output_data = new float[total_size];
- UnaryFunctor functor;
- TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data,
- strides, input_data);
- for (int i = 0; i < total_size; ++i) {
- VERIFY_IS_EQUAL(output_data[i], functor(input_data[i]));
- }
-
- delete[] input_data;
- delete[] output_data;
-}
-
-template <int Layout>
-static void test_block_cwise_unary_io_zero_strides() {
- typedef internal::scalar_square_op<float> UnaryFunctor;
- typedef internal::TensorBlockCwiseUnaryIO<UnaryFunctor, Index, float, 5,
- Layout>
- TensorBlockCwiseUnaryIO;
-
- DSizes<Index, 5> rnd_dims = RandomDims<5>();
-
- DSizes<Index, 5> input_sizes = rnd_dims;
- input_sizes[0] = 1;
- input_sizes[2] = 1;
- input_sizes[4] = 1;
-
- DSizes<Index, 5> input_strides(ComputeStrides<Layout, 5>(input_sizes));
- input_strides[0] = 0;
- input_strides[2] = 0;
- input_strides[4] = 0;
-
- // Generate random data.
- float* input_data = GenerateRandomData<float>(input_sizes.TotalSize());
-
- DSizes<Index, 5> output_sizes = rnd_dims;
- DSizes<Index, 5> output_strides(ComputeStrides<Layout, 5>(output_sizes));
-
- const Index output_total_size = output_sizes.TotalSize();
- float* output_data = new float[output_total_size];
-
- UnaryFunctor functor;
- TensorBlockCwiseUnaryIO::Run(functor, output_sizes, output_strides,
- output_data, input_strides, input_data);
- for (int i = 0; i < rnd_dims[0]; ++i) {
- for (int j = 0; j < rnd_dims[1]; ++j) {
- for (int k = 0; k < rnd_dims[2]; ++k) {
- for (int l = 0; l < rnd_dims[3]; ++l) {
- for (int m = 0; m < rnd_dims[4]; ++m) {
- Index output_index = i * output_strides[0] + j * output_strides[1] +
- k * output_strides[2] + l * output_strides[3] +
- m * output_strides[4];
- Index input_index = i * input_strides[0] + j * input_strides[1] +
- k * input_strides[2] + l * input_strides[3] +
- m * input_strides[4];
- VERIFY_IS_EQUAL(output_data[output_index],
- functor(input_data[input_index]));
- }
- }
- }
- }
- }
-
- delete[] input_data;
- delete[] output_data;
-}
-
-template <typename T, int NumDims, int Layout>
-static void test_block_cwise_binary_io_basic() {
- typedef internal::scalar_sum_op<T> BinaryFunctor;
- typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, T, NumDims,
- Layout>
- TensorBlockCwiseBinaryIO;
-
- DSizes<Index, NumDims> block_sizes = RandomDims<NumDims>();
- DSizes<Index, NumDims> strides(ComputeStrides<Layout, NumDims>(block_sizes));
-
- const Index total_size = block_sizes.TotalSize();
-
-  // Create random input tensors.
- T* left_data = GenerateRandomData<T>(total_size);
- T* right_data = GenerateRandomData<T>(total_size);
-
- T* output_data = new T[total_size];
- BinaryFunctor functor;
- TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data,
- strides, left_data, strides, right_data);
- for (int i = 0; i < total_size; ++i) {
- VERIFY_IS_EQUAL(output_data[i], functor(left_data[i], right_data[i]));
- }
-
- delete[] left_data;
- delete[] right_data;
- delete[] output_data;
-}
-
-template <int Layout>
-static void test_block_cwise_binary_io_squeeze_ones() {
- typedef internal::scalar_sum_op<float> BinaryFunctor;
- typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, float, 5,
- Layout>
- TensorBlockCwiseBinaryIO;
-
- DSizes<Index, 5> block_sizes(1, 2, 1, 3, 1);
- DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));
-
- const Index total_size = block_sizes.TotalSize();
-
-  // Create random input tensors.
- float* left_data = GenerateRandomData<float>(total_size);
- float* right_data = GenerateRandomData<float>(total_size);
-
- float* output_data = new float[total_size];
- BinaryFunctor functor;
- TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data,
- strides, left_data, strides, right_data);
- for (int i = 0; i < total_size; ++i) {
- VERIFY_IS_EQUAL(output_data[i], functor(left_data[i], right_data[i]));
- }
-
- delete[] left_data;
- delete[] right_data;
- delete[] output_data;
-}
-
-template <int Layout>
-static void test_block_cwise_binary_io_zero_strides() {
- typedef internal::scalar_sum_op<float> BinaryFunctor;
- typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, float, 5,
- Layout>
- TensorBlockCwiseBinaryIO;
-
- DSizes<Index, 5> rnd_dims = RandomDims<5>();
-
- DSizes<Index, 5> left_sizes = rnd_dims;
- left_sizes[0] = 1;
- left_sizes[2] = 1;
- left_sizes[4] = 1;
-
- DSizes<Index, 5> left_strides(ComputeStrides<Layout, 5>(left_sizes));
- left_strides[0] = 0;
- left_strides[2] = 0;
- left_strides[4] = 0;
-
- DSizes<Index, 5> right_sizes = rnd_dims;
- right_sizes[1] = 1;
- right_sizes[3] = 1;
-
- DSizes<Index, 5> right_strides(ComputeStrides<Layout, 5>(right_sizes));
- right_strides[1] = 0;
- right_strides[3] = 0;
-
- // Generate random data.
- float* left_data = GenerateRandomData<float>(left_sizes.TotalSize());
- float* right_data = GenerateRandomData<float>(right_sizes.TotalSize());
-
- DSizes<Index, 5> output_sizes = rnd_dims;
- DSizes<Index, 5> output_strides(ComputeStrides<Layout, 5>(output_sizes));
-
- const Index output_total_size = output_sizes.TotalSize();
- float* output_data = new float[output_total_size];
-
- BinaryFunctor functor;
- TensorBlockCwiseBinaryIO::Run(functor, output_sizes, output_strides,
- output_data, left_strides, left_data,
- right_strides, right_data);
- for (int i = 0; i < rnd_dims[0]; ++i) {
- for (int j = 0; j < rnd_dims[1]; ++j) {
- for (int k = 0; k < rnd_dims[2]; ++k) {
- for (int l = 0; l < rnd_dims[3]; ++l) {
- for (int m = 0; m < rnd_dims[4]; ++m) {
- Index output_index = i * output_strides[0] + j * output_strides[1] +
- k * output_strides[2] + l * output_strides[3] +
- m * output_strides[4];
- Index left_index = i * left_strides[0] + j * left_strides[1] +
- k * left_strides[2] + l * left_strides[3] +
- m * left_strides[4];
- Index right_index = i * right_strides[0] + j * right_strides[1] +
- k * right_strides[2] + l * right_strides[3] +
- m * right_strides[4];
- VERIFY_IS_EQUAL(
- output_data[output_index],
- functor(left_data[left_index], right_data[right_index]));
- }
- }
- }
- }
- }
-
- delete[] left_data;
- delete[] right_data;
- delete[] output_data;
-}
-
-template <int Layout>
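Several of the removed tests (test_block_io_zero_stride and the cwise *_zero_strides variants) rely on the convention that a size-1 dimension with stride 0 broadcasts: the same input coefficient is re-read for every output index along that dimension. A tiny 2-D sketch of a binary cwise kernel under that convention; the names and the fixed sum functor are illustrative:

#include <cstddef>

// out(i,j) = lhs(i,j) + rhs(i,j) over a column-major 2-D block. An operand
// broadcasts a dimension simply by passing a zero stride for it.
void AddBlocks2D(std::size_t rows, std::size_t cols,
                 float* out, std::size_t out_s0, std::size_t out_s1,
                 const float* lhs, std::size_t lhs_s0, std::size_t lhs_s1,
                 const float* rhs, std::size_t rhs_s0, std::size_t rhs_s1) {
  for (std::size_t j = 0; j < cols; ++j) {
    for (std::size_t i = 0; i < rows; ++i) {
      out[i * out_s0 + j * out_s1] =
          lhs[i * lhs_s0 + j * lhs_s1] + rhs[i * rhs_s0 + j * rhs_s1];
    }
  }
}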
static void test_uniform_block_shape()
{
typedef internal::TensorBlock<int, Index, 5, Layout> TensorBlock;
@@ -1196,21 +561,6 @@ static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
EIGEN_DECLARE_TEST(cxx11_tensor_block_access) {
TEST_LAYOUTS(test_block_mapper_sanity);
TEST_LAYOUTS_AND_DIMS(float, test_block_mapper_maps_every_element);
- TEST_LAYOUTS_AND_DIMS(float, test_slice_block_mapper_maps_every_element);
- TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_data_from_source_to_target);
- TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_data_from_source_to_target);
- TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_using_reordered_dimensions);
- TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_using_reordered_dimensions);
- TEST_LAYOUTS(test_block_io_copy_using_reordered_dimensions_do_not_squeeze);
- TEST_LAYOUTS(test_block_io_copy_using_reordered_dimensions_squeeze);
- TEST_LAYOUTS(test_block_io_zero_stride);
- TEST_LAYOUTS(test_block_io_squeeze_ones);
- TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_unary_io_basic);
- TEST_LAYOUTS(test_block_cwise_unary_io_squeeze_ones);
- TEST_LAYOUTS(test_block_cwise_unary_io_zero_strides);
- TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_binary_io_basic);
- TEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones);
- TEST_LAYOUTS(test_block_cwise_binary_io_zero_strides);
TEST_LAYOUTS(test_uniform_block_shape);
TEST_LAYOUTS(test_skewed_inner_dim_block_shape);
TEST_LAYOUTS_WITH_ARG(test_empty_dims, internal::kUniformAllDims);
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
index 0e70e1770..66b06e8ee 100644
--- a/unsupported/test/cxx11_tensor_executor.cpp
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -311,48 +311,6 @@ static void test_execute_shuffle_lvalue(Device d)
}
template <typename T, int NumDims, typename Device, bool Vectorizable,
- TiledEvaluation Tiling, int Layout>
-static void test_execute_reduction(Device d)
-{
- static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
-
- static constexpr int ReducedDims = NumDims - 2;
- static constexpr int Options = 0 | Layout;
-
- auto dims = RandomDims<NumDims>(5, 10);
- Tensor<T, NumDims, Options, Index> src(dims);
- src.setRandom();
-
- // Pick two random and unique reduction dimensions.
- int reduction0 = internal::random<int>(0, NumDims - 1);
- int reduction1 = internal::random<int>(0, NumDims - 1);
- while (reduction0 == reduction1) {
- reduction1 = internal::random<int>(0, NumDims - 1);
- }
-
- DSizes<Index, 2> reduction_axis;
- reduction_axis[0] = reduction0;
- reduction_axis[1] = reduction1;
-
- Tensor<T, ReducedDims, Options, Index> golden = src.sum(reduction_axis);
-
- // Now do the reduction using configured tensor executor.
- Tensor<T, ReducedDims, Options, Index> dst(golden.dimensions());
-
- auto expr = src.sum(reduction_axis);
-
- using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
- using Executor =
- internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
-
- Executor::run(Assign(dst, expr), d);
-
- for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
- VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
- }
-}
-
-template <typename T, int NumDims, typename Device, bool Vectorizable,
TiledEvaluation Tiling, int Layout>
static void test_execute_reshape(Device d)
{
@@ -663,57 +621,34 @@ static void test_async_execute_binary_expr(Device d)
#define CALL_SUBTEST_PART(PART) \
CALL_SUBTEST_##PART
-#define CALL_SUBTEST_COMBINATIONS_V1(PART, NAME, T, NUM_DIMS) \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, ColMajor>(default_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, ColMajor>(default_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(default_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(default_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, RowMajor>(default_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, RowMajor>(default_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(default_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(default_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, ColMajor>(tp_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(tp_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, RowMajor>(tp_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(tp_device)))
-
- // NOTE: Tiling V2 is currently implemented only for a limited set of expression types, and only with the default device.
-#define CALL_SUBTEST_COMBINATIONS_V2(PART, NAME, T, NUM_DIMS) \
+#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, ColMajor>(default_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, ColMajor>(default_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, ColMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(default_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(default_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, RowMajor>(default_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Legacy, RowMajor>(default_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, RowMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(default_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(default_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(default_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, RowMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(tp_device)))
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(tp_device)))
// NOTE: Currently only ThreadPoolDevice supports async expression evaluation.
#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Legacy, RowMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, RowMajor>(tp_device))); \
CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
- CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Legacy, RowMajor>(tp_device)))
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(tp_device)))
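For reference, CALL_SUBTEST_PART(PART) expands to CALL_SUBTEST_##PART, so a single line such as CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3) instantiates the test over {device} x {vectorization} x {TiledEvaluation::Off, On} x {layout}. One of the resulting calls, expanded here purely for illustration:

// One expansion of CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3):
CALL_SUBTEST_1((test_execute_unary_expr<float, 3, DefaultDevice, false,
                                        TiledEvaluation::On, ColMajor>(
    default_device)));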
EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
Eigen::DefaultDevice default_device;
@@ -724,69 +659,64 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
Eigen::ThreadPool tp(num_threads);
Eigen::ThreadPoolDevice tp_device(&tp, num_threads);
- CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 3);
- CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 4);
- CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 5);
-
- CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 3);
- CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 4);
- CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 5);
-
- CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 3);
- CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 4);
- CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 5);
-
- CALL_SUBTEST_COMBINATIONS_V2(4, test_execute_chipping_rvalue, float, 3);
- CALL_SUBTEST_COMBINATIONS_V2(4, test_execute_chipping_rvalue, float, 4);
- CALL_SUBTEST_COMBINATIONS_V2(4, test_execute_chipping_rvalue, float, 5);
-
- CALL_SUBTEST_COMBINATIONS_V2(5, test_execute_chipping_lvalue, float, 3);
- CALL_SUBTEST_COMBINATIONS_V2(5, test_execute_chipping_lvalue, float, 4);
- CALL_SUBTEST_COMBINATIONS_V2(5, test_execute_chipping_lvalue, float, 5);
-
- CALL_SUBTEST_COMBINATIONS_V2(6, test_execute_shuffle_rvalue, float, 3);
- CALL_SUBTEST_COMBINATIONS_V2(6, test_execute_shuffle_rvalue, float, 4);
- CALL_SUBTEST_COMBINATIONS_V2(6, test_execute_shuffle_rvalue, float, 5);
-
- CALL_SUBTEST_COMBINATIONS_V2(7, test_execute_shuffle_lvalue, float, 3);
- CALL_SUBTEST_COMBINATIONS_V2(7, test_execute_shuffle_lvalue, float, 4);
- CALL_SUBTEST_COMBINATIONS_V2(7, test_execute_shuffle_lvalue, float, 5);
-
- CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 2);
- CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 3);
- CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 4);
- CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 5);
-
- CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 2);
- CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 3);
- CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 4);
- CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 5);
-
- CALL_SUBTEST_COMBINATIONS_V2(10, test_execute_slice_rvalue, float, 2);
- CALL_SUBTEST_COMBINATIONS_V2(10, test_execute_slice_rvalue, float, 3);
- CALL_SUBTEST_COMBINATIONS_V2(10, test_execute_slice_rvalue, float, 4);
- CALL_SUBTEST_COMBINATIONS_V2(10, test_execute_slice_rvalue, float, 5);
-
- CALL_SUBTEST_COMBINATIONS_V2(11, test_execute_slice_lvalue, float, 2);
- CALL_SUBTEST_COMBINATIONS_V2(11, test_execute_slice_lvalue, float, 3);
- CALL_SUBTEST_COMBINATIONS_V2(11, test_execute_slice_lvalue, float, 4);
- CALL_SUBTEST_COMBINATIONS_V2(11, test_execute_slice_lvalue, float, 5);
-
- CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 2);
- CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 3);
- CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 4);
- CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 5);
-
- CALL_SUBTEST_COMBINATIONS_V2(13, test_execute_generator_op, float, 2);
- CALL_SUBTEST_COMBINATIONS_V2(13, test_execute_generator_op, float, 3);
- CALL_SUBTEST_COMBINATIONS_V2(13, test_execute_generator_op, float, 4);
- CALL_SUBTEST_COMBINATIONS_V2(13, test_execute_generator_op, float, 5);
-
- CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 1);
- CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 2);
- CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 3);
- CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 4);
- CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 5);
+ CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3);
+ CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 4);
+ CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 3);
+ CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 4);
+ CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 3);
+ CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 4);
+ CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 2);
+ CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 3);
+ CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 4);
+ CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 2);
+ CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 2);
+ CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2);
+ CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3);
+ CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4);
+ CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2);
+ CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3);
+ CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4);
+ CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1);
+ CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2);
+ CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5);
CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3);
CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4);