From 13c3327f5cf829fd9d04a2ab46861e722cd74ca0 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 12 Nov 2019 10:12:28 -0800 Subject: Remove legacy block evaluation support --- unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 2 - .../Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h | 1 - unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 15 - unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 907 --------------------- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 274 ------- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 68 -- .../Eigen/CXX11/src/Tensor/TensorConcatenation.h | 2 - .../Eigen/CXX11/src/Tensor/TensorContraction.h | 1 - .../Eigen/CXX11/src/Tensor/TensorConversion.h | 1 - .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 2 - .../Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 1 - .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 2 - unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 16 - .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 72 -- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 202 ----- unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 1 - .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 1 - .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 11 - .../CXX11/src/Tensor/TensorForwardDeclarations.h | 9 +- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 55 -- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 134 --- .../Eigen/CXX11/src/Tensor/TensorInflation.h | 1 - .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 2 - .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 178 +--- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 1 - unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 1 - .../Eigen/CXX11/src/Tensor/TensorReduction.h | 258 ------ unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 3 - unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 108 --- unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 1 - .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 114 +-- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 2 - unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h | 1 - .../Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 1 - unsupported/test/cxx11_tensor_block_access.cpp | 650 --------------- unsupported/test/cxx11_tensor_executor.cpp | 212 ++--- 36 files changed, 74 insertions(+), 3236 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index f2a5d86fe..68bfd141a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -88,7 +88,6 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, @@ -230,7 +229,6 @@ struct TensorEvaluator, Devi enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator >, Device>::Layout, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h index e6d8e7f91..2184c94b3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h @@ -108,7 +108,6 @@ struct TensorEvaluator, Sy enum { IsAligned = false, PacketAccess = false, - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = false, Layout 
= TensorEvaluator::Layout, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 1f64de3a9..d7795a00d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -108,8 +108,6 @@ struct TensorEvaluator, Device> TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess & - TensorEvaluator::BlockAccess, BlockAccessV2 = TensorEvaluator::BlockAccessV2 & TensorEvaluator::BlockAccessV2, PreferBlockAccess = TensorEvaluator::PreferBlockAccess | @@ -216,19 +214,6 @@ struct TensorEvaluator, Device> m_rightImpl.getResourceRequirements(resources); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) { - if (TensorEvaluator::RawAccess && - m_leftImpl.data() != NULL) { - TensorBlock left_block(block->first_coeff_index(), block->block_sizes(), - block->tensor_strides(), block->tensor_strides(), - m_leftImpl.data() + block->first_coeff_index()); - m_rightImpl.block(&left_block); - } else { - m_rightImpl.block(block); - m_leftImpl.writeBlock(*block); - } - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2( TensorBlockDesc& desc, TensorBlockScratch& scratch) { if (TensorEvaluator::RawAccess && diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index a8e7a8d7b..447da9121 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -142,782 +142,6 @@ class TensorBlock { Scalar* m_data; // Not owned. }; -template -struct TensorBlockCopyOp { - - typedef typename packet_traits::type Packet; - enum { - Vectorizable = packet_traits::Vectorizable, - PacketSize = packet_traits::size - }; - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const StorageIndex num_coeff_to_copy, const StorageIndex dst_index, - const StorageIndex dst_stride, Scalar* EIGEN_RESTRICT dst_data, - const StorageIndex src_index, const StorageIndex src_stride, - const Scalar* EIGEN_RESTRICT src_data) { - const Scalar* src = &src_data[src_index]; - Scalar* dst = &dst_data[dst_index]; - - if (!Vectorizable) { - for (Index i = 0; i < num_coeff_to_copy; ++i) { - dst[i * dst_stride] = src[i * src_stride]; - } - return; - } - - if (src_stride == 1) { - const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize; - if (dst_stride == 1) { - // LINEAR - for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) { - Packet p = ploadu(src + i); - pstoreu(dst + i, p); - } - for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) { - dst[i] = src[i]; - } - } else { - // SCATTER - for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) { - Packet p = ploadu(src + i); - pscatter(dst + i * dst_stride, p, dst_stride); - } - for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) { - dst[i * dst_stride] = src[i]; - } - } - } else if (src_stride == 0) { - const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize; - if (dst_stride == 1) { - // LINEAR - for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) { - Packet p = pload1(src); - pstoreu(dst + i, p); - } - for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) { - dst[i] = *src; - } - } else { - // SCATTER - for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) { - Packet p = pload1(src); - pscatter(dst + i * dst_stride, 
p, dst_stride); - } - for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) { - dst[i * dst_stride] = *src; - } - } - } else { - if (dst_stride == 1) { - // GATHER - const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize; - for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) { - Packet p = pgather(src + i * src_stride, src_stride); - pstoreu(dst + i, p); - } - for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) { - dst[i] = src[i * src_stride]; - } - } else { - // RANDOM - for (StorageIndex i = 0; i < num_coeff_to_copy; ++i) { - dst[i * dst_stride] = src[i * src_stride]; - } - } - } - } -}; - -/** - * \class TensorBlockIO - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor block IO class. - * - * This class is responsible for copying data between a tensor and a tensor - * block. - */ -template -class TensorBlockIO { - public: - typedef TensorBlock Block; - typedef TensorBlockCopyOp BlockCopyOp; - - protected: - typedef array Dimensions; - - struct BlockIteratorState { - StorageIndex input_stride; - StorageIndex output_stride; - StorageIndex input_span; - StorageIndex output_span; - StorageIndex size; - StorageIndex count; - BlockIteratorState() - : input_stride(0), - output_stride(0), - input_span(0), - output_span(0), - size(0), - count(0) {} - }; - - // Compute how many inner dimensions it's allowed to squeeze when doing IO - // between a tensor and a block. It's safe to squeeze inner dimensions, only - // if they are not reordered. - static int NumSqueezableInnerDims(const Dimensions& tensor_to_block_dim_map) { - int num_squeezable_dims = 0; - if (Layout == ColMajor) { - for (int i = 0; i < NumDims; ++i) { - if (tensor_to_block_dim_map[i] == i) num_squeezable_dims++; - else break; - } - } else { - for (int i = NumDims - 1; i >= 0; --i) { - if (tensor_to_block_dim_map[i] == i) num_squeezable_dims++; - else break; - } - } - return num_squeezable_dims; - } - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy( - const Block& block, StorageIndex first_coeff_index, - const Dimensions& tensor_to_block_dim_map, - const Dimensions& tensor_strides, - const Scalar* src_data, - Scalar* dst_data) { - // Do not squeeze reordered inner dimensions. - int num_squeezable_dims = NumSqueezableInnerDims(tensor_to_block_dim_map); - - // Find the innermost tensor dimension whose size is not 1. This is the - // effective inner dim. If all dimensions are of size 1, then fallback to - // using the actual innermost dim to avoid out-of-bound access. - StorageIndex num_size_one_inner_dims = 0; - for (int i = 0; i < num_squeezable_dims; ++i) { - const int dim = cond()(i, NumDims - i - 1); - if (block.block_sizes()[tensor_to_block_dim_map[dim]] != 1) { - num_size_one_inner_dims = i; - break; - } - } - - // Calculate strides and dimensions. - const StorageIndex tensor_stride1_dim = cond()( - num_size_one_inner_dims, NumDims - num_size_one_inner_dims - 1); - const StorageIndex block_dim_for_tensor_stride1_dim = - NumDims == 0 ? 1 : tensor_to_block_dim_map[tensor_stride1_dim]; - StorageIndex block_inner_dim_size = - NumDims == 0 ? 1 - : block.block_sizes()[block_dim_for_tensor_stride1_dim]; - - // Squeeze multiple inner dims into one for larger inner dim size. 
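For reference, a minimal standalone sketch (plain C++, scalar loops only, no Eigen packet ops) of the four stride cases the removed TensorBlockCopyOp::Run dispatches on; the deleted kernel additionally vectorizes each case with ploadu/pstoreu/pgather/pscatter and special-cases a zero source stride:

#include <cstddef>

template <typename Scalar>
void strided_copy(std::ptrdiff_t n, Scalar* dst, std::ptrdiff_t dst_stride,
                  const Scalar* src, std::ptrdiff_t src_stride) {
  if (src_stride == 1 && dst_stride == 1) {
    for (std::ptrdiff_t i = 0; i < n; ++i) dst[i] = src[i];                       // LINEAR
  } else if (src_stride == 1) {
    for (std::ptrdiff_t i = 0; i < n; ++i) dst[i * dst_stride] = src[i];          // SCATTER
  } else if (src_stride == 0) {
    for (std::ptrdiff_t i = 0; i < n; ++i) dst[i * dst_stride] = *src;            // broadcast one value
  } else if (dst_stride == 1) {
    for (std::ptrdiff_t i = 0; i < n; ++i) dst[i] = src[i * src_stride];          // GATHER
  } else {
    for (std::ptrdiff_t i = 0; i < n; ++i) dst[i * dst_stride] = src[i * src_stride];  // RANDOM
  }
}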
- for (Index i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) { - const Index dim = cond()(i, NumDims - i - 1); - const StorageIndex block_stride = - block.block_strides()[tensor_to_block_dim_map[dim]]; - if (block_inner_dim_size == block_stride && - block_stride == tensor_strides[dim]) { - block_inner_dim_size *= - block.block_sizes()[tensor_to_block_dim_map[dim]]; - ++num_size_one_inner_dims; - } else { - break; - } - } - - StorageIndex inputIndex; - StorageIndex outputIndex; - StorageIndex input_stride; - StorageIndex output_stride; - - // Setup strides to read/write along the tensor's stride1 dimension. - if (BlockRead) { - inputIndex = first_coeff_index; - outputIndex = 0; - input_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim]; - output_stride = - NumDims == 0 - ? 1 - : block.block_strides()[block_dim_for_tensor_stride1_dim]; - } else { - inputIndex = 0; - outputIndex = first_coeff_index; - input_stride = - NumDims == 0 - ? 1 - : block.block_strides()[block_dim_for_tensor_stride1_dim]; - output_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim]; - } - - const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; - array block_iter_state; - - // Initialize block iterator state. Squeeze away any dimension of size 1. - Index num_squeezed_dims = 0; - for (Index i = num_size_one_inner_dims; i < NumDims - 1; ++i) { - const Index dim = cond()(i + 1, NumDims - i - 2); - const StorageIndex size = block.block_sizes()[tensor_to_block_dim_map[dim]]; - if (size == 1) { - continue; - } - block_iter_state[num_squeezed_dims].size = size; - if (BlockRead) { - block_iter_state[num_squeezed_dims].input_stride = tensor_strides[dim]; - block_iter_state[num_squeezed_dims].output_stride = - block.block_strides()[tensor_to_block_dim_map[dim]]; - } else { - block_iter_state[num_squeezed_dims].input_stride = - block.block_strides()[tensor_to_block_dim_map[dim]]; - block_iter_state[num_squeezed_dims].output_stride = tensor_strides[dim]; - } - block_iter_state[num_squeezed_dims].input_span = - block_iter_state[num_squeezed_dims].input_stride * - (block_iter_state[num_squeezed_dims].size - 1); - block_iter_state[num_squeezed_dims].output_span = - block_iter_state[num_squeezed_dims].output_stride * - (block_iter_state[num_squeezed_dims].size - 1); - ++num_squeezed_dims; - } - - // Iterate copying data from src to dst. - const StorageIndex block_total_size = - NumDims == 0 ? 1 : block.block_sizes().TotalSize(); - for (StorageIndex i = 0; i < block_total_size; i += block_inner_dim_size) { - BlockCopyOp::Run(block_inner_dim_size, outputIndex, output_stride, - dst_data, inputIndex, input_stride, src_data); - // Update index. - for (int j = 0; j < num_squeezed_dims; ++j) { - if (++block_iter_state[j].count < block_iter_state[j].size) { - inputIndex += block_iter_state[j].input_stride; - outputIndex += block_iter_state[j].output_stride; - break; - } - block_iter_state[j].count = 0; - inputIndex -= block_iter_state[j].input_span; - outputIndex -= block_iter_state[j].output_span; - } - } - } -}; - -/** - * \class TensorBlockReader - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor block reader class. - * - * This class is responsible for reading a tensor block. 
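For reference, a minimal sketch (plain C++, made-up sizes and strides) of the inner-dimension squeezing used throughout the removed block IO code: contiguous inner dimensions whose strides line up on both sides are folded into one longer 1-D copy, so fewer but larger copy calls are issued:

#include <cstdio>

int main() {
  const int NumDims = 3;
  const long sizes[NumDims]       = {4, 8, 5};    // block sizes (ColMajor)
  const long strides[NumDims]     = {1, 4, 32};   // block strides
  const long src_strides[NumDims] = {1, 4, 40};   // source tensor strides

  long inner_size = sizes[0];
  int squeezed = 1;
  for (int i = 1; i < NumDims; ++i) {
    // A dimension can be folded in only if both buffers are contiguous across it.
    if (inner_size == strides[i] && strides[i] == src_strides[i]) {
      inner_size *= sizes[i];
      ++squeezed;
    } else {
      break;
    }
  }
  std::printf("squeezed %d dims into one copy of length %ld\n", squeezed, inner_size);
  return 0;
}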
- * - */ -template -class TensorBlockReader : public TensorBlockIO { - public: - typedef TensorBlock Block; - typedef TensorBlockIO Base; - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - Block* block, const Scalar* src_data) { - array tensor_to_block_dim_map; - for (int i = 0; i < NumDims; ++i) { - tensor_to_block_dim_map[i] = i; - } - Base::Copy(*block, block->first_coeff_index(), tensor_to_block_dim_map, - block->tensor_strides(), src_data, block->data()); - } - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - Block* block, StorageIndex first_coeff_index, - const array& tensor_to_block_dim_map, - const array& tensor_strides, const Scalar* src_data) { - Base::Copy(*block, first_coeff_index, tensor_to_block_dim_map, - tensor_strides, src_data, block->data()); - } -}; - -/** - * \class TensorBlockWriter - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor block writer class. - * - * This class is responsible for writing a tensor block. - * - */ -template -class TensorBlockWriter : public TensorBlockIO { - public: - typedef TensorBlock Block; - typedef TensorBlockIO Base; - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const Block& block, Scalar* dst_data) { - array tensor_to_block_dim_map; - for (int i = 0; i < NumDims; ++i) { - tensor_to_block_dim_map[i] = i; - } - Base::Copy(block, block.first_coeff_index(), tensor_to_block_dim_map, - block.tensor_strides(), block.data(), dst_data); - } - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const Block& block, StorageIndex first_coeff_index, - const array& tensor_to_block_dim_map, - const array& tensor_strides, Scalar* dst_data) { - Base::Copy(block, first_coeff_index, tensor_to_block_dim_map, - tensor_strides, block.data(), dst_data); - } -}; - -/** - * \class TensorBlockCwiseUnaryOp - * \ingroup CXX11_Tensor_Module - * - * \brief Carries out a cwise binary op on a number of coefficients. - * - * This class reads strided input from the argument, and writes the - * result of the cwise unary op to the strided output array. 
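For reference, a minimal sketch of the Map-with-InnerStride pattern the removed TensorBlockCwiseUnaryOp builds on (assuming Eigen/Core is available; function and parameter names here are illustrative): strided input and output are wrapped in Eigen maps so the functor is applied as one expression, and the deleted specialization switches to contiguous maps when all strides are 1 so the expression can vectorize:

#include <Eigen/Core>

template <typename UnaryFunctor, typename Scalar>
void strided_unary(const UnaryFunctor& functor, Eigen::Index n,
                   Scalar* output_data, Eigen::Index output_stride,
                   const Scalar* input_data, Eigen::Index input_stride) {
  typedef Eigen::Array<Scalar, Eigen::Dynamic, 1> Arr;
  typedef Eigen::Map<const Arr, 0, Eigen::InnerStride<> > InputMap;
  typedef Eigen::Map<Arr, 0, Eigen::InnerStride<> > OutputMap;

  InputMap input(input_data, n, Eigen::InnerStride<>(input_stride));
  OutputMap output(output_data, n, Eigen::InnerStride<>(output_stride));

  // Same shape as the deleted kernel: one Eigen expression per strided run.
  output = input.unaryExpr(functor);
}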
- * - */ -template -struct TensorBlockCwiseUnaryOp { - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const UnaryFunctor& functor, const StorageIndex num_coeff, - const StorageIndex output_index, const StorageIndex output_stride, - OutputScalar* output_data, const StorageIndex input_index, - const StorageIndex input_stride, const InputScalar* input_data) { - typedef const Array Input; - typedef Array Output; - - typedef Map > InputMap; - typedef Map > OutputMap; - - const InputScalar* input_base = &input_data[input_index]; - OutputScalar* output_base = &output_data[output_index]; - - const InputMap input(input_base, num_coeff, InnerStride<>(input_stride)); - OutputMap output(output_base, num_coeff, InnerStride<>(output_stride)); - - output = CwiseUnaryOp(input, functor); - } -}; - -template<> -struct TensorBlockCwiseUnaryOp { - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const UnaryFunctor& functor, const StorageIndex num_coeff, - const StorageIndex output_index, const StorageIndex output_stride, - OutputScalar* output_data, const StorageIndex input_index, - const StorageIndex input_stride, const InputScalar* input_data) { - if (input_stride == 1 && output_stride == 1) { - typedef const Array Input; - typedef Array Output; - - const Map input(&input_data[input_index], num_coeff); - Map output(&output_data[output_index], num_coeff); - - output = CwiseUnaryOp >(input, functor); - } else { - TensorBlockCwiseUnaryOp::Run( - functor, num_coeff, output_index, output_stride, output_data, - input_index, input_stride, input_data); - } - } -}; - -/** - * \class TensorBlockCwiseUnaryIO - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor block IO class for carrying out cwise unary ops. - * - * This class carries out the unary op on given blocks. - */ -template -struct TensorBlockCwiseUnaryIO { - typedef typename TensorBlock::Dimensions Dimensions; - - typedef TensorBlockCwiseUnaryOp< - packet_traits::Vectorizable && - functor_traits::PacketAccess> - TensorBlockCwiseUnaryOpImpl; - - struct BlockIteratorState { - StorageIndex output_stride, output_span; - StorageIndex input_stride, input_span; - StorageIndex size, count; - }; - - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const UnaryFunctor& functor, const Dimensions& block_sizes, - const Dimensions& block_strides, OutputScalar* output_data, - const array& input_strides, - const InputScalar* input_data) { - // Find the innermost dimension whose size is not 1. This is the effective - // inner dim. If all dimensions are of size 1, fallback to using the actual - // innermost dim to avoid out-of-bound access. - int num_size_one_inner_dims = 0; - for (int i = 0; i < NumDims; ++i) { - const int dim = cond()(i, NumDims - i - 1); - if (block_sizes[dim] != 1) { - num_size_one_inner_dims = i; - break; - } - } - // Calculate strides and dimensions. - const int inner_dim = - NumDims == 0 ? 1 - : cond()(num_size_one_inner_dims, - NumDims - num_size_one_inner_dims - 1); - StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim]; - for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) { - const int dim = cond()(i, NumDims - i - 1); - // Merge multiple inner dims into one for larger inner dim size (i.e. - // fewer calls to TensorBlockCwiseUnaryOp::Run()). 
- if (inner_dim_size == block_strides[dim] && - block_strides[dim] == input_strides[dim]) { - inner_dim_size *= block_sizes[dim]; - ++num_size_one_inner_dims; - } else { - break; - } - } - - StorageIndex output_index = 0, input_index = 0; - - const StorageIndex output_stride = - NumDims == 0 ? 1 : block_strides[inner_dim]; - const StorageIndex input_stride = - NumDims == 0 ? 1 : input_strides[inner_dim]; - - const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; - array block_iter_state; - - // Initialize block iterator state. Squeeze away any dimension of size 1. - int num_squeezed_dims = 0; - for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { - const int dim = cond()(i + 1, NumDims - i - 2); - const StorageIndex size = block_sizes[dim]; - if (size == 1) { - continue; - } - BlockIteratorState& state = block_iter_state[num_squeezed_dims]; - state.output_stride = block_strides[dim]; - state.input_stride = input_strides[dim]; - state.size = size; - state.output_span = state.output_stride * (size - 1); - state.input_span = state.input_stride * (size - 1); - state.count = 0; - ++num_squeezed_dims; - } - - // Compute cwise unary op. - const StorageIndex block_total_size = - NumDims == 0 ? 1 : block_sizes.TotalSize(); - for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) { - TensorBlockCwiseUnaryOpImpl::Run(functor, inner_dim_size, output_index, - output_stride, output_data, input_index, - input_stride, input_data); - // Update index. - for (int j = 0; j < num_squeezed_dims; ++j) { - BlockIteratorState& state = block_iter_state[j]; - if (++state.count < state.size) { - output_index += state.output_stride; - input_index += state.input_stride; - break; - } - state.count = 0; - output_index -= state.output_span; - input_index -= state.input_span; - } - } - } -}; - -/** - * \class TensorBlockCwiseBinaryOp - * \ingroup CXX11_Tensor_Module - * - * \brief Carries out a cwise binary op on a number of coefficients. - * - * This class reads strided inputs from left and right operands, and writes the - * result of the cwise binary op to the strided output array. 
- * - */ -template -struct TensorBlockCwiseBinaryOp { - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const BinaryFunctor& functor, const StorageIndex num_coeff, - const StorageIndex output_index, const StorageIndex output_stride, - OutputScalar* output_data, const StorageIndex left_index, - const StorageIndex left_stride, const LeftScalar* left_data, - const StorageIndex right_index, const StorageIndex right_stride, - const RightScalar* right_data) { - typedef const Array Lhs; - typedef const Array Rhs; - typedef Array Out; - - typedef Map > LhsMap; - typedef Map > RhsMap; - typedef Map > OutMap; - - const LeftScalar* lhs_base = &left_data[left_index]; - const RightScalar* rhs_base = &right_data[right_index]; - OutputScalar* out_base = &output_data[output_index]; - - const LhsMap lhs(lhs_base, num_coeff, InnerStride<>(left_stride)); - const RhsMap rhs(rhs_base, num_coeff, InnerStride<>(right_stride)); - OutMap out(out_base, num_coeff, InnerStride<>(output_stride)); - - out = CwiseBinaryOp(lhs, rhs, functor); - } -}; - -template<> -struct TensorBlockCwiseBinaryOp { - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const BinaryFunctor& functor, const StorageIndex num_coeff, - const StorageIndex output_index, const StorageIndex output_stride, - OutputScalar* output_data, const StorageIndex left_index, - const StorageIndex left_stride, const LeftScalar* left_data, - const StorageIndex right_index, const StorageIndex right_stride, - const RightScalar* right_data) { - if (left_stride == 1 && right_stride == 1 && output_stride == 1) { - typedef const Array Lhs; - typedef const Array Rhs; - typedef Array Out; - - const LeftScalar* lhs_base = &left_data[left_index]; - const RightScalar* rhs_base = &right_data[right_index]; - OutputScalar* out_base = &output_data[output_index]; - - const Map lhs(lhs_base, num_coeff); - const Map rhs(rhs_base, num_coeff); - Map out(out_base, num_coeff); - - out = CwiseBinaryOp, Map >(lhs, rhs, functor); - } else { - TensorBlockCwiseBinaryOp::Run( - functor, num_coeff, output_index, output_stride, output_data, - left_index, left_stride, left_data, right_index, right_stride, - right_data); - } - } -}; - -/** - * \class TensorBlockCwiseBinaryIO - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor block IO class for carrying out cwise binary ops. - * - * This class carries out the binary op on given blocks. - * - */ -template -struct TensorBlockCwiseBinaryIO { - typedef typename TensorBlock::Dimensions Dimensions; - - typedef TensorBlockCwiseBinaryOp< - packet_traits::Vectorizable && - functor_traits::PacketAccess> - TensorBlockCwiseBinaryOpImpl; - - struct BlockIteratorState { - StorageIndex output_stride, output_span; - StorageIndex left_stride, left_span; - StorageIndex right_stride, right_span; - StorageIndex size, count; - }; - - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const BinaryFunctor& functor, const Dimensions& block_sizes, - const Dimensions& block_strides, OutputScalar* output_data, - const array& left_strides, - const LeftScalar* left_data, - const array& right_strides, - const RightScalar* right_data) { - // Find the innermost dimension whose size is not 1. This is the effective - // inner dim. If all dimensions are of size 1, fallback to using the actual - // innermost dim to avoid out-of-bound access. 
- int num_size_one_inner_dims = 0; - for (int i = 0; i < NumDims; ++i) { - const int dim = cond()(i, NumDims - i - 1); - if (block_sizes[dim] != 1) { - num_size_one_inner_dims = i; - break; - } - } - // Calculate strides and dimensions. - const int inner_dim = - NumDims == 0 ? 1 - : cond()(num_size_one_inner_dims, - NumDims - num_size_one_inner_dims - 1); - StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim]; - for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) { - const int dim = cond()(i, NumDims - i - 1); - // Merge multiple inner dims into one for larger inner dim size (i.e. - // fewer calls to TensorBlockCwiseBinaryOp::Run()). - if (inner_dim_size == block_strides[dim] && - block_strides[dim] == left_strides[dim] && - block_strides[dim] == right_strides[dim]) { - inner_dim_size *= block_sizes[dim]; - ++num_size_one_inner_dims; - } else { - break; - } - } - - StorageIndex output_index = 0, left_index = 0, right_index = 0; - const StorageIndex output_stride = - NumDims == 0 ? 1 : block_strides[inner_dim]; - const StorageIndex left_stride = NumDims == 0 ? 1 : left_strides[inner_dim]; - const StorageIndex right_stride = - NumDims == 0 ? 1 : right_strides[inner_dim]; - - const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; - array block_iter_state; - - // Initialize block iterator state. Squeeze away any dimension of size 1. - int num_squeezed_dims = 0; - for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { - const int dim = cond()(i + 1, NumDims - i - 2); - const StorageIndex size = block_sizes[dim]; - if (size == 1) { - continue; - } - BlockIteratorState& state = block_iter_state[num_squeezed_dims]; - state.output_stride = block_strides[dim]; - state.left_stride = left_strides[dim]; - state.right_stride = right_strides[dim]; - state.size = size; - state.output_span = state.output_stride * (size - 1); - state.left_span = state.left_stride * (size - 1); - state.right_span = state.right_stride * (size - 1); - state.count = 0; - ++num_squeezed_dims; - } - - // Compute cwise binary op. - const StorageIndex block_total_size = - NumDims == 0 ? 1 : block_sizes.TotalSize(); - for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) { - TensorBlockCwiseBinaryOpImpl::Run(functor, inner_dim_size, output_index, - output_stride, output_data, left_index, - left_stride, left_data, right_index, - right_stride, right_data); - // Update index. - for (int j = 0; j < num_squeezed_dims; ++j) { - BlockIteratorState& state = block_iter_state[j]; - if (++state.count < state.size) { - output_index += state.output_stride; - left_index += state.left_stride; - right_index += state.right_stride; - break; - } - state.count = 0; - output_index -= state.output_span; - left_index -= state.left_span; - right_index -= state.right_span; - } - } - } -}; - -/** - * \class TensorBlockView - * \ingroup CXX11_Tensor_Module - * - * \brief Read-only view into a block of data. - * - * This class provides read-only access to a block of data in impl. It may need - * to allocate space for holding the intermediate result. - * - */ -template -struct TensorBlockView { - typedef TensorEvaluator Impl; - typedef typename Impl::Index StorageIndex; - typedef typename remove_const::type Scalar; - static const int NumDims = array_size::value; - typedef DSizes Dimensions; - - // Constructs a TensorBlockView for `impl`. `block` is only used for for - // specifying the start offset, shape, and strides of the block. 
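For reference, a minimal sketch (plain C++, hypothetical BlockView type) of the view-or-copy policy the removed TensorBlockView implements: reuse the evaluator's buffer when raw access is available, otherwise materialize the block into temporary storage:

#include <cstddef>
#include <vector>

struct BlockView {
  const float* data;           // borrowed from the source, or owned via `scratch`
  std::vector<float> scratch;  // stays empty when the source buffer is used directly
};

void init_view(BlockView& view, const float* source, std::size_t first_coeff_index,
               std::size_t block_size, bool raw_access) {
  if (raw_access && source != nullptr) {
    view.data = source + first_coeff_index;  // zero-copy view into the evaluator's buffer
  } else {
    view.scratch.resize(block_size);         // materialize the block into temporary storage
    // ... evaluate or copy the block's coefficients into view.scratch here ...
    view.data = view.scratch.data();
  }
}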
- template - TensorBlockView(const Device& device, - const TensorEvaluator& impl, - const OtherTensorBlock& block) - : m_device(device), - m_block_sizes(block.block_sizes()), - m_data(NULL), - m_allocated_data(NULL) { - if (Impl::RawAccess && impl.data() != NULL) { - m_data = impl.data() + block.first_coeff_index(); - m_block_strides = block.tensor_strides(); - } else { - // Actually make a copy. - - // TODO(wuke): This sometimes put a lot pressure on the heap allocator. - // Consider allowing ops to request additional temporary block memory in - // TensorOpResourceRequirements. - m_allocated_data = static_cast( - m_device.allocate(m_block_sizes.TotalSize() * sizeof(Scalar))); - m_data = m_allocated_data; - if (NumDims > 0) { - if (static_cast(Impl::Layout) == static_cast(ColMajor)) { - m_block_strides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_block_strides[i] = m_block_strides[i - 1] * m_block_sizes[i - 1]; - } - } else { - m_block_strides[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_block_strides[i] = m_block_strides[i + 1] * m_block_sizes[i + 1]; - } - } - } - TensorBlock input_block( - block.first_coeff_index(), m_block_sizes, m_block_strides, - block.tensor_strides(), m_allocated_data); - impl.block(&input_block); - } - } - - ~TensorBlockView() { - if (m_allocated_data != NULL) { - m_device.deallocate(m_allocated_data); - } - } - - const Dimensions& block_sizes() const { return m_block_sizes; } - const Dimensions& block_strides() const { return m_block_strides; } - const Scalar* data() const { return m_data; } - - private: - const Device EIGEN_DEVICE_REF m_device; - Dimensions m_block_sizes, m_block_strides; - const Scalar* m_data; // Not owned. - Scalar* m_allocated_data; // Owned. -}; - /** * \class TensorBlockMapper * \ingroup CXX11_Tensor_Module @@ -1108,137 +332,6 @@ class TensorBlockMapper { StorageIndex m_total_block_count; }; -/** - * \class TensorSliceBlockMapper - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor slice block mapper class. - * - * This class is responsible for iterating over the blocks of - * a slice of a tensor. Supports shuffling of the block strides - * for callers that want to reduce strides for dimensions to be - * processed together. - * - */ -template -class TensorSliceBlockMapper { - public: - typedef TensorBlock Block; - typedef DSizes Dimensions; - - TensorSliceBlockMapper(const Dimensions& tensor_dims, - const Dimensions& tensor_slice_offsets, - const Dimensions& tensor_slice_extents, - const Dimensions& block_dim_sizes, - const Dimensions& block_stride_order) - : m_tensor_dimensions(tensor_dims), - m_tensor_slice_offsets(tensor_slice_offsets), - m_tensor_slice_extents(tensor_slice_extents), - m_block_dim_sizes(block_dim_sizes), - m_block_stride_order(block_stride_order), - m_total_block_count(1) { - // Calculate block counts by dimension and total block count. - DSizes block_count; - for (Index i = 0; i < block_count.rank(); ++i) { - block_count[i] = divup(m_tensor_slice_extents[i], m_block_dim_sizes[i]); - } - m_total_block_count = array_prod(block_count); - - // Calculate block strides (used for enumerating blocks). 
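For reference, a minimal sketch (plain C++, made-up extents) of how the removed block mappers enumerate blocks: per-dimension block counts come from divup(extent, block_size), and ColMajor block strides convert a flat block index back into per-dimension block coordinates:

#include <cstdio>

int main() {
  const int NumDims = 3;
  const long dims[NumDims]        = {70, 9, 4};   // tensor (or slice) extents
  const long block_sizes[NumDims] = {32, 4, 4};   // requested block shape

  long block_count[NumDims], block_strides[NumDims];
  for (int i = 0; i < NumDims; ++i)
    block_count[i] = (dims[i] + block_sizes[i] - 1) / block_sizes[i];  // divup

  block_strides[0] = 1;                                    // ColMajor enumeration
  for (int i = 1; i < NumDims; ++i)
    block_strides[i] = block_strides[i - 1] * block_count[i - 1];

  long total_blocks = 1;
  for (int i = 0; i < NumDims; ++i) total_blocks *= block_count[i];

  // Recover the per-dimension block coordinate of one flat block index.
  long block_index = 7, coords[NumDims];
  for (int i = NumDims - 1; i > 0; --i) {
    coords[i] = block_index / block_strides[i];
    block_index -= coords[i] * block_strides[i];
  }
  coords[0] = block_index;
  std::printf("total blocks: %ld, block 7 -> (%ld, %ld, %ld)\n",
              total_blocks, coords[0], coords[1], coords[2]);
  return 0;
}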
- if (static_cast(Layout) == static_cast(ColMajor)) { - m_block_strides[0] = 1; - m_tensor_strides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_block_strides[i] = m_block_strides[i - 1] * block_count[i - 1]; - m_tensor_strides[i] = - m_tensor_strides[i - 1] * m_tensor_dimensions[i - 1]; - } - } else { - m_block_strides[NumDims - 1] = 1; - m_tensor_strides[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_block_strides[i] = m_block_strides[i + 1] * block_count[i + 1]; - m_tensor_strides[i] = - m_tensor_strides[i + 1] * m_tensor_dimensions[i + 1]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Block - GetBlockForIndex(StorageIndex block_index, Scalar* data) const { - StorageIndex first_coeff_index = 0; - DSizes coords; - DSizes sizes; - DSizes strides; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = block_index / m_block_strides[i]; - coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i]; - sizes[i] = numext::mini( - m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i], - m_block_dim_sizes[i]); - block_index -= idx * m_block_strides[i]; - first_coeff_index += coords[i] * m_tensor_strides[i]; - } - coords[0] = - m_tensor_slice_offsets[0] + block_index * m_block_dim_sizes[0]; - sizes[0] = numext::mini( - m_tensor_slice_offsets[0] + m_tensor_slice_extents[0] - coords[0], - m_block_dim_sizes[0]); - first_coeff_index += coords[0] * m_tensor_strides[0]; - - StorageIndex prev_dim = m_block_stride_order[0]; - strides[prev_dim] = 1; - for (int i = 1; i < NumDims; ++i) { - const StorageIndex curr_dim = m_block_stride_order[i]; - strides[curr_dim] = strides[prev_dim] * sizes[prev_dim]; - prev_dim = curr_dim; - } - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const StorageIndex idx = block_index / m_block_strides[i]; - coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i]; - sizes[i] = numext::mini( - m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i], - m_block_dim_sizes[i]); - block_index -= idx * m_block_strides[i]; - first_coeff_index += coords[i] * m_tensor_strides[i]; - } - coords[NumDims - 1] = m_tensor_slice_offsets[NumDims - 1] + - block_index * m_block_dim_sizes[NumDims - 1]; - sizes[NumDims - 1] = numext::mini( - m_tensor_slice_offsets[NumDims - 1] + - m_tensor_slice_extents[NumDims - 1] - coords[NumDims - 1], - m_block_dim_sizes[NumDims - 1]); - first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1]; - - StorageIndex prev_dim = m_block_stride_order[NumDims - 1]; - strides[prev_dim] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - const StorageIndex curr_dim = m_block_stride_order[i]; - strides[curr_dim] = strides[prev_dim] * sizes[prev_dim]; - prev_dim = curr_dim; - } - } - - return Block(first_coeff_index, sizes, strides, m_tensor_strides, data); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const { - return m_total_block_count; - } - - private: - Dimensions m_tensor_dimensions; - Dimensions m_tensor_slice_offsets; - Dimensions m_tensor_slice_extents; - Dimensions m_tensor_strides; - Dimensions m_block_dim_sizes; - Dimensions m_block_stride_order; - Dimensions m_block_strides; - StorageIndex m_total_block_count; -}; - } // namespace internal } // namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 58164c13a..80162ad12 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -114,7 +114,6 @@ struct TensorEvaluator, Device> enum { IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, BlockAccessV2 = TensorEvaluator::BlockAccessV2, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, @@ -123,21 +122,10 @@ struct TensorEvaluator, Device> typedef typename internal::remove_const::type ScalarNoConst; - // Block based access to the XprType (input) tensor. - typedef internal::TensorBlock - TensorBlock; - typedef internal::TensorBlockReader - TensorBlockReader; - // We do block based broadcasting using a trick with 2x tensor rank and 0 // strides. See block method implementation for details. typedef DSizes BroadcastDimensions; - typedef internal::TensorBlock - BroadcastTensorBlock; - typedef internal::TensorBlockReader - BroadcastTensorBlockReader; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; @@ -641,246 +629,6 @@ struct TensorEvaluator, Device> m_impl.getResourceRequirements(resources); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( - TensorBlock* output_block) const { - if (NumDims <= 0) { - output_block->data()[0] = m_impl.coeff(0); - return; - } - - // Because we only support kSkewedInnerDims blocking, block size should be - // equal to m_dimensions for inner dims, a smaller than m_dimensions[i] size - // for the first outer dim, and 1 for other outer dims. This is guaranteed - // by MergeResourceRequirements() in TensorBlock.h. - const Dimensions& output_block_sizes = output_block->block_sizes(); - const Dimensions& output_block_strides = output_block->block_strides(); - - // Find where outer dims start. - int outer_dim_start = 0; - Index outer_dim_size = 1, inner_dim_size = 1; - for (int i = 0; i < NumDims; ++i) { - const int dim = static_cast(Layout) == static_cast(ColMajor) - ? i - : NumDims - i - 1; - if (i > outer_dim_start) { - eigen_assert(output_block_sizes[dim] == 1); - } else if (output_block_sizes[dim] != m_dimensions[dim]) { - eigen_assert(output_block_sizes[dim] < m_dimensions[dim]); - outer_dim_size = output_block_sizes[dim]; - } else { - inner_dim_size *= output_block_sizes[dim]; - ++outer_dim_start; - } - } - - if (inner_dim_size == 0 || outer_dim_size == 0) { - return; - } - - const Dimensions& input_dims = Dimensions(m_impl.dimensions()); - - // Pre-fill input_block_sizes, broadcast_block_sizes, - // broadcast_block_strides, and broadcast_tensor_strides. Later on we will - // only modify the outer_dim_start-th dimension on these arrays. - - // Calculate the input block size for looking into the input. - Dimensions input_block_sizes; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = 0; i < outer_dim_start; ++i) { - input_block_sizes[i] = input_dims[i]; - } - for (int i = outer_dim_start; i < NumDims; ++i) { - input_block_sizes[i] = 1; - } - } else { - for (int i = 0; i < outer_dim_start; ++i) { - input_block_sizes[NumDims - i - 1] = input_dims[NumDims - i - 1]; - } - for (int i = outer_dim_start; i < NumDims; ++i) { - input_block_sizes[NumDims - i - 1] = 1; - } - } - - // Broadcast with the 0-stride trick: Create 1 extra dim for each - // broadcast, set the input stride to 0. - // - // When ColMajor: - // - broadcast_block_sizes is [d_0, b_0, d_1, b_1, ...]. 
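For reference, a minimal sketch (plain C++, a 1-D input broadcast by a factor of 2) of the 0-stride trick described in the comment above: each broadcast dimension becomes an extra dimension whose input stride is 0, so an ordinary strided copy loop re-reads the same input values:

#include <cstdio>

int main() {
  const float input[3] = {1.f, 2.f, 3.f};   // d = 3
  const int d = 3, b = 2;                   // broadcast factor b = 2
  float output[d * b];

  // sizes = {d, b}, input strides = {1, 0}, output strides = {1, d}
  for (int j = 0; j < b; ++j)               // broadcast dim: input stride 0
    for (int i = 0; i < d; ++i)             // copy dim: input stride 1
      output[j * d + i] = input[i * 1 + j * 0];

  for (int k = 0; k < d * b; ++k) std::printf("%g ", output[k]);  // 1 2 3 1 2 3
  std::printf("\n");
  return 0;
}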
- // - // - broadcast_block_strides is [output_block_strides[0], - // output_block_strides[0] * d_0, - // output_block_strides[1], - // output_block_strides[1] * d_1, - // ...]. - // - // - broadcast_tensor_strides is [output_block_strides[0], - // 0, - // output_block_strides[1], - // 0, - // ...]. - BroadcastDimensions broadcast_block_sizes, broadcast_block_strides, - broadcast_tensor_strides; - - for (int i = 0; i < outer_dim_start; ++i) { - const int dim = static_cast(Layout) == static_cast(ColMajor) - ? i - : NumDims - i - 1; - const int copy_dim = - static_cast(Layout) == static_cast(ColMajor) - ? 2 * i - : 2 * NumDims - 2 * i - 1; - const int broadcast_dim = - static_cast(Layout) == static_cast(ColMajor) ? copy_dim + 1 - : copy_dim - 1; - broadcast_block_sizes[copy_dim] = input_dims[dim]; - broadcast_block_sizes[broadcast_dim] = m_broadcast[dim]; - broadcast_block_strides[copy_dim] = output_block_strides[dim]; - broadcast_block_strides[broadcast_dim] = - output_block_strides[dim] * input_dims[dim]; - broadcast_tensor_strides[copy_dim] = m_inputStrides[dim]; - broadcast_tensor_strides[broadcast_dim] = 0; - } - for (int i = 2 * outer_dim_start; i < 2 * NumDims; ++i) { - const int dim = static_cast(Layout) == static_cast(ColMajor) - ? i - : 2 * NumDims - i - 1; - broadcast_block_sizes[dim] = 1; - broadcast_block_strides[dim] = 0; - broadcast_tensor_strides[dim] = 0; - } - - const int outer_dim = static_cast(Layout) == static_cast(ColMajor) - ? outer_dim_start - : NumDims - outer_dim_start - 1; - - if (outer_dim_size == 1) { - // We just need one block read using the ready-set values above. - BroadcastBlock(input_block_sizes, broadcast_block_sizes, - broadcast_block_strides, broadcast_tensor_strides, 0, - output_block); - } else if (input_dims[outer_dim] == 1) { - // Broadcast outer_dim_start-th dimension (< NumDims) by outer_dim_size. - const int broadcast_outer_dim = - static_cast(Layout) == static_cast(ColMajor) - ? 2 * outer_dim_start + 1 - : 2 * NumDims - 2 * outer_dim_start - 2; - broadcast_block_sizes[broadcast_outer_dim] = outer_dim_size; - broadcast_tensor_strides[broadcast_outer_dim] = 0; - broadcast_block_strides[broadcast_outer_dim] = - output_block_strides[outer_dim]; - BroadcastBlock(input_block_sizes, broadcast_block_sizes, - broadcast_block_strides, broadcast_tensor_strides, 0, - output_block); - } else { - // The general case. Let's denote the output block as x[..., - // a:a+outer_dim_size, :, ..., :], where a:a+outer_dim_size is a slice on - // the outer_dim_start-th dimension (< NumDims). We need to split the - // a:a+outer_dim_size into possibly 3 sub-blocks: - // - // (1) a:b, where b is the smallest multiple of - // input_dims[outer_dim_start] in [a, a+outer_dim_size]. - // - // (2) b:c, where c is the largest multiple of input_dims[outer_dim_start] - // in [a, a+outer_dim_size]. - // - // (3) c:a+outer_dim_size . - // - // Or, when b and c do not exist, we just need to process the whole block - // together. - - // Find a. - const Index outer_dim_left_index = - output_block->first_coeff_index() / m_outputStrides[outer_dim]; - - // Find b and c. - const Index input_outer_dim_size = input_dims[outer_dim]; - - // First multiple after a. This is b when <= outer_dim_left_index + - // outer_dim_size. - const Index first_multiple = - divup(outer_dim_left_index, input_outer_dim_size) * - input_outer_dim_size; - - if (first_multiple <= outer_dim_left_index + outer_dim_size) { - // b exists, so does c. Find it. 
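For reference, a minimal sketch (plain C++, made-up extents) of the head/middle/tail split described above: the output range [a, a + outer_dim_size) is cut at the first and last multiples of the input extent, so the middle part consists of whole copies of the input and only the head and tail need partial handling:

#include <cstdio>

int main() {
  const long input_extent = 5;              // input_dims[outer_dim]
  const long a = 7, size = 14;              // output block covers [7, 21)

  const long first_multiple = (a + input_extent - 1) / input_extent * input_extent;  // b = 10
  const long last_multiple  = (a + size) / input_extent * input_extent;              // c = 20

  if (first_multiple <= a + size) {
    std::printf("head [%ld, %ld)\n", a, first_multiple);               // partial copy
    std::printf("mid  [%ld, %ld)\n", first_multiple, last_multiple);   // whole input copies
    std::printf("tail [%ld, %ld)\n", last_multiple, a + size);         // partial copy
  } else {
    std::printf("single piece [%ld, %ld)\n", a, a + size);             // b and c do not exist
  }
  return 0;
}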
- const Index last_multiple = (outer_dim_left_index + outer_dim_size) / - input_outer_dim_size * input_outer_dim_size; - const int copy_outer_dim = - static_cast(Layout) == static_cast(ColMajor) - ? 2 * outer_dim_start - : 2 * NumDims - 2 * outer_dim_start - 1; - const int broadcast_outer_dim = - static_cast(Layout) == static_cast(ColMajor) - ? 2 * outer_dim_start + 1 - : 2 * NumDims - 2 * outer_dim_start - 2; - if (first_multiple > outer_dim_left_index) { - const Index head_size = first_multiple - outer_dim_left_index; - input_block_sizes[outer_dim] = head_size; - broadcast_block_sizes[copy_outer_dim] = head_size; - broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim]; - broadcast_block_strides[copy_outer_dim] = - output_block_strides[outer_dim]; - broadcast_block_sizes[broadcast_outer_dim] = 1; - broadcast_tensor_strides[broadcast_outer_dim] = 0; - broadcast_block_strides[broadcast_outer_dim] = - output_block_strides[outer_dim] * input_dims[outer_dim]; - BroadcastBlock(input_block_sizes, broadcast_block_sizes, - broadcast_block_strides, broadcast_tensor_strides, 0, - output_block); - } - if (first_multiple < last_multiple) { - input_block_sizes[outer_dim] = input_outer_dim_size; - broadcast_block_sizes[copy_outer_dim] = input_outer_dim_size; - broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim]; - broadcast_block_strides[copy_outer_dim] = - output_block_strides[outer_dim]; - broadcast_block_sizes[broadcast_outer_dim] = - (last_multiple - first_multiple) / input_outer_dim_size; - broadcast_tensor_strides[broadcast_outer_dim] = 0; - broadcast_block_strides[broadcast_outer_dim] = - output_block_strides[outer_dim] * input_dims[outer_dim]; - const Index offset = (first_multiple - outer_dim_left_index) * - m_outputStrides[outer_dim]; - BroadcastBlock(input_block_sizes, broadcast_block_sizes, - broadcast_block_strides, broadcast_tensor_strides, - offset, output_block); - } - if (last_multiple < outer_dim_left_index + outer_dim_size) { - const Index tail_size = - outer_dim_left_index + outer_dim_size - last_multiple; - input_block_sizes[outer_dim] = tail_size; - broadcast_block_sizes[copy_outer_dim] = tail_size; - broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim]; - broadcast_block_strides[copy_outer_dim] = - output_block_strides[outer_dim]; - broadcast_block_sizes[broadcast_outer_dim] = 1; - broadcast_tensor_strides[broadcast_outer_dim] = 0; - broadcast_block_strides[broadcast_outer_dim] = - output_block_strides[outer_dim] * input_dims[outer_dim]; - const Index offset = (last_multiple - outer_dim_left_index) * - m_outputStrides[outer_dim]; - BroadcastBlock(input_block_sizes, broadcast_block_sizes, - broadcast_block_strides, broadcast_tensor_strides, - offset, output_block); - } - } else { - // b and c do not exist. - const int copy_outer_dim = - static_cast(Layout) == static_cast(ColMajor) - ? 
2 * outer_dim_start - : 2 * NumDims - 2 * outer_dim_start - 1; - input_block_sizes[outer_dim] = outer_dim_size; - broadcast_block_sizes[copy_outer_dim] = outer_dim_size; - broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim]; - broadcast_block_strides[copy_outer_dim] = - output_block_strides[outer_dim]; - BroadcastBlock(input_block_sizes, broadcast_block_sizes, - broadcast_block_strides, broadcast_tensor_strides, 0, - output_block); - } - } - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { @@ -1096,28 +844,6 @@ struct TensorEvaluator, Device> return params; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void BroadcastBlock( - const Dimensions& input_block_sizes, - const BroadcastDimensions& broadcast_block_sizes, - const BroadcastDimensions& broadcast_block_strides, - const BroadcastDimensions& broadcast_tensor_strides, Index offset, - TensorBlock* output_block) const { - TensorBlock input_view_block( - static_cast(Layout) == static_cast(ColMajor) - ? indexColMajor(output_block->first_coeff_index() + offset) - : indexRowMajor(output_block->first_coeff_index() + offset), - input_block_sizes, Dimensions(m_inputStrides), - Dimensions(m_inputStrides), NULL); - - internal::TensorBlockView input_block(m_device, m_impl, - input_view_block); - BroadcastTensorBlock broadcast_block( - 0, broadcast_block_sizes, broadcast_block_strides, - broadcast_tensor_strides, output_block->data() + offset); - - BroadcastTensorBlockReader::Run(&broadcast_block, input_block.data()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 emptyBlock() const { DSizes dimensions; for (int i = 0; i < NumDims; ++i) dimensions[i] = 0; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 32d6960bf..098110217 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -148,7 +148,6 @@ struct TensorEvaluator, Device> IsAligned = false, Layout = TensorEvaluator::Layout, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, BlockAccessV2 = TensorEvaluator::BlockAccessV2, // Chipping of outer-most dimension is a trivial operation, because we can // read and write directly from the underlying tensor using single offset. 
@@ -167,11 +166,6 @@ struct TensorEvaluator, Device> typedef typename internal::remove_const::type ScalarNoConst; - typedef internal::TensorBlock - InputTensorBlock; - typedef internal::TensorBlock - OutputTensorBlock; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; @@ -218,20 +212,6 @@ struct TensorEvaluator, Device> } m_inputStride *= input_dims[m_dim.actualDim()]; m_inputOffset = m_stride * op.offset(); - - if (BlockAccess) { - if (static_cast(Layout) == static_cast(ColMajor)) { - m_inputStrides[0] = 1; - for (int i = 1; i < NumInputDims; ++i) { - m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; - } - } else { - m_inputStrides[NumInputDims - 1] = 1; - for (int i = NumInputDims - 2; i >= 0; --i) { - m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; - } - } - } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -323,52 +303,6 @@ struct TensorEvaluator, Device> m_impl.getResourceRequirements(resources); } - // TODO(andydavis) Reduce the overhead of this function (experiment with - // using a fixed block size). - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( - OutputTensorBlock* output_block) const { - // Calculate input block sizes. - const DSizes& output_block_sizes = - output_block->block_sizes(); - const DSizes& output_block_strides = - output_block->block_strides(); - const Index chip_dim = m_dim.actualDim(); - DSizes input_block_sizes; - DSizes input_block_strides; - for (Index i = 0; i < NumInputDims; ++i) { - if (i < chip_dim) { - input_block_sizes[i] = output_block_sizes[i]; - input_block_strides[i] = output_block_strides[i]; - } else if (i > chip_dim) { - input_block_sizes[i] = output_block_sizes[i - 1]; - input_block_strides[i] = output_block_strides[i - 1]; - } else { - input_block_sizes[i] = 1; - } - } - // Fix up input_block_stride for chip dimension. - if (static_cast(Layout) == static_cast(ColMajor)) { - if (chip_dim == 0) { - input_block_strides[chip_dim] = 1; - } else { - input_block_strides[chip_dim] = - input_block_strides[chip_dim - 1] * input_block_sizes[chip_dim - 1]; - } - } else { - if (chip_dim == NumInputDims - 1) { - input_block_strides[chip_dim] = 1; - } else { - input_block_strides[chip_dim] = - input_block_strides[chip_dim + 1] * input_block_sizes[chip_dim + 1]; - } - } - // Instantiate and read input block from input tensor. 
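For reference, a minimal sketch (plain C++, made-up sizes) of the dimension mapping the removed chipping block path performs: the output block's sizes are copied into the input block's sizes with a size-1 slot re-inserted at the chipped dimension:

#include <cstdio>

int main() {
  const int NumInputDims = 4, chip_dim = 2;
  const long output_sizes[NumInputDims - 1] = {8, 5, 3};
  long input_sizes[NumInputDims];

  for (int i = 0; i < NumInputDims; ++i) {
    if (i < chip_dim)      input_sizes[i] = output_sizes[i];
    else if (i > chip_dim) input_sizes[i] = output_sizes[i - 1];
    else                   input_sizes[i] = 1;   // chipped dimension
  }
  std::printf("%ld %ld %ld %ld\n", input_sizes[0], input_sizes[1],
              input_sizes[2], input_sizes[3]);    // 8 5 1 3
  return 0;
}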
- InputTensorBlock input_block(srcCoeff(output_block->first_coeff_index()), - input_block_sizes, input_block_strides, - m_inputStrides, output_block->data()); - m_impl.block(&input_block); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool root_of_expr_ast = false) const { @@ -482,7 +416,6 @@ struct TensorEvaluator, Device> Index m_stride; Index m_inputOffset; Index m_inputStride; - DSizes m_inputStrides; TensorEvaluator m_impl; const internal::DimensionId m_dim; const Device EIGEN_DEVICE_REF m_device; @@ -508,7 +441,6 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, BlockAccessV2 = TensorEvaluator::RawAccess, Layout = TensorEvaluator::Layout, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 26276abaf..aad9d86be 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -125,7 +125,6 @@ struct TensorEvaluator::PacketAccess && TensorEvaluator::PacketAccess, - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess || TensorEvaluator::PreferBlockAccess, @@ -325,7 +324,6 @@ template::PacketAccess && TensorEvaluator::PacketAccess, - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess || TensorEvaluator::PreferBlockAccess, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 87e8db3fd..4bc7b3942 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -381,7 +381,6 @@ struct TensorContractionEvaluatorBase enum { IsAligned = true, PacketAccess = (PacketType::size > 1), - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 2a6d67ad5..027322582 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -302,7 +302,6 @@ struct TensorEvaluator, Device> TensorEvaluator::PacketAccess & internal::type_casting_traits::VectorizedCast, #endif - BlockAccess = false, BlockAccessV2 = TensorEvaluator::BlockAccessV2, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 8220038c1..44068fedc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -309,7 +309,6 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, @@ -787,7 +786,6 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index b660242f4..5c94165d1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -242,7 +242,6 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index f1f46161e..242533f72 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -95,7 +95,6 @@ struct TensorEvaluator, Devi enum { IsAligned = false, PacketAccess = (PacketType::size > 1), - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, @@ -269,7 +268,6 @@ struct TensorEvaluator::size > 1), - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index cd1338c66..722032a3a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -110,7 +110,6 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = true, BlockAccessV2 = true, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, @@ -120,9 +119,6 @@ struct TensorEvaluator, Device> static const int NumDims = internal::traits::NumDimensions; - typedef typename internal::TensorBlock TensorBlock; - typedef typename internal::TensorBlockReader TensorBlockReader; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; @@ -173,13 +169,6 @@ struct TensorEvaluator, Device> m_impl.getResourceRequirements(resources); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) { - TensorBlock eval_to_block(block->first_coeff_index(), block->block_sizes(), - block->tensor_strides(), block->tensor_strides(), - m_buffer + block->first_coeff_index()); - m_impl.block(&eval_to_block); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2( TensorBlockDesc& desc, TensorBlockScratch& scratch) { // Add `m_buffer` as destination buffer to the block descriptor. @@ -216,11 +205,6 @@ struct TensorEvaluator, Device> return internal::ploadt(m_buffer + index); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const { - assert(m_buffer != NULL); - TensorBlockReader::Run(block, m_buffer); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { // We assume that evalPacket or evalScalar is called to perform the // assignment and account for the cost of the write here. 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index ce2305b56..d6a3e6abe 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -45,7 +45,6 @@ struct TensorEvaluator enum { IsAligned = Derived::IsAligned, PacketAccess = (PacketType::size > 1), - BlockAccess = internal::is_arithmetic::type>::value, BlockAccessV2 = internal::is_arithmetic::type>::value, PreferBlockAccess = false, Layout = Derived::Layout, @@ -55,13 +54,6 @@ struct TensorEvaluator typedef typename internal::remove_const::type ScalarNoConst; - typedef typename internal::TensorBlock - TensorBlock; - typedef typename internal::TensorBlockReader - TensorBlockReader; - typedef typename internal::TensorBlockWriter - TensorBlockWriter; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; @@ -160,11 +152,6 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector*) const {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const { - assert(m_data != NULL); - TensorBlockReader::Run(block, m_data); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { @@ -172,12 +159,6 @@ struct TensorEvaluator return TensorBlockV2::materialize(m_data, m_dims, desc, scratch); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( - const TensorBlock& block) { - assert(m_data != NULL); - TensorBlockWriter::Run(block, m_data); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( const TensorBlockDesc& desc, const TensorBlockV2& block) { @@ -263,7 +244,6 @@ struct TensorEvaluator enum { IsAligned = Derived::IsAligned, PacketAccess = (PacketType::size > 1), - BlockAccess = internal::is_arithmetic::value, BlockAccessV2 = internal::is_arithmetic::value, PreferBlockAccess = false, Layout = Derived::Layout, @@ -271,11 +251,6 @@ struct TensorEvaluator RawAccess = true }; - typedef typename internal::TensorBlock - TensorBlock; - typedef typename internal::TensorBlockReader - TensorBlockReader; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; @@ -348,11 +323,6 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector*) const {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const { - assert(m_data != NULL); - TensorBlockReader::Run(block, m_data); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { @@ -404,7 +374,6 @@ struct TensorEvaluator, Device> && (PacketType::size >1) #endif , - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, @@ -475,7 +444,6 @@ struct TensorEvaluator, Device> IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, BlockAccessV2 = TensorEvaluator::BlockAccessV2, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, 
@@ -554,24 +522,6 @@ struct TensorEvaluator, Device> m_argImpl.getResourceRequirements(resources); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( - TensorBlock* output_block) const { - if (NumDims <= 0) { - output_block->data()[0] = coeff(0); - return; - } - internal::TensorBlockView arg_block(m_device, m_argImpl, - *output_block); - internal::TensorBlockCwiseUnaryIO::Run(m_functor, - output_block->block_sizes(), - output_block - ->block_strides(), - output_block->data(), - arg_block.block_strides(), - arg_block.data()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { @@ -608,8 +558,6 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess & - TensorEvaluator::BlockAccess, BlockAccessV2 = TensorEvaluator::BlockAccessV2 & TensorEvaluator::BlockAccessV2, PreferBlockAccess = TensorEvaluator::PreferBlockAccess | @@ -713,24 +661,6 @@ struct TensorEvaluatordata()[0] = coeff(Index(0)); - return; - } - internal::TensorBlockView left_block( - m_device, m_leftImpl, *output_block); - internal::TensorBlockView right_block( - m_device, m_rightImpl, *output_block); - internal::TensorBlockCwiseBinaryIO< - BinaryOp, Index, typename internal::remove_const::type, NumDims, - Layout>::Run(m_functor, output_block->block_sizes(), - output_block->block_strides(), output_block->data(), - left_block.block_strides(), left_block.data(), - right_block.block_strides(), right_block.data()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { @@ -768,7 +698,6 @@ struct TensorEvaluator::PacketAccess && TensorEvaluator::PacketAccess && internal::functor_traits::PacketAccess, - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess || TensorEvaluator::PreferBlockAccess || @@ -887,7 +816,6 @@ struct TensorEvaluator PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & PacketType::HasBlend, - BlockAccess = false, BlockAccessV2 = TensorEvaluator::BlockAccessV2 && TensorEvaluator::BlockAccessV2 && TensorEvaluator::BlockAccessV2, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 0fb0a9227..9926046b9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -153,70 +153,6 @@ class TensorExecutor -class TensorExecutor { - public: - typedef typename traits::Scalar Scalar; - typedef typename remove_const::type ScalarNoConst; - - typedef TensorEvaluator Evaluator; - typedef typename traits::Index StorageIndex; - - static const int NumDims = traits::NumDimensions; - - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE void run(const Expression& expr, - const DefaultDevice& device = DefaultDevice()) { - typedef TensorBlock TensorBlock; - typedef TensorBlockMapper TensorBlockMapper; - typedef typename TensorBlock::Dimensions TensorBlockDimensions; - - Evaluator evaluator(expr, device); - Index total_size = array_prod(evaluator.dimensions()); - Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar); - - if (total_size < cache_size - && !ExpressionHasTensorBroadcastingOp::value) { - // TODO(andydavis) Reduce block management overhead for small tensors. 
- internal::TensorExecutor::run(expr,device); - evaluator.cleanup(); - return; - } - - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) { - // Size tensor blocks to fit in cache (or requested target block size). - Index block_total_size = numext::mini(cache_size, total_size); - TensorBlockShapeType block_shape = kSkewedInnerDims; - // Query expression tree for desired block size/shape. - std::vector resources; - evaluator.getResourceRequirements(&resources); - MergeResourceRequirements(resources, &block_shape, &block_total_size); - - TensorBlockMapper block_mapper( - TensorBlockDimensions(evaluator.dimensions()), block_shape, - block_total_size); - block_total_size = block_mapper.block_dims_total_size(); - - ScalarNoConst* data = static_cast( - device.allocate(block_total_size * sizeof(Scalar))); - - const StorageIndex total_block_count = block_mapper.total_block_count(); - for (StorageIndex i = 0; i < total_block_count; ++i) { - TensorBlock block = block_mapper.GetBlockForIndex(i, data); - evaluator.evalBlock(&block); - } - device.deallocate(data); - } - evaluator.cleanup(); - } -}; - /** * Process all the data with a single cpu thread, using blocks of data. By * sizing a block to fit L1 cache we get better cache performance. @@ -446,59 +382,6 @@ class TensorExecutor { } }; -template -class TensorExecutor { - public: - typedef typename traits::Index StorageIndex; - typedef typename traits::Scalar Scalar; - typedef typename remove_const::type ScalarNoConst; - - static const int NumDims = traits::NumDimensions; - - typedef TensorEvaluator Evaluator; - typedef TensorBlockMapper BlockMapper; - typedef TensorExecutorTilingContext TilingContext; - - static EIGEN_STRONG_INLINE void run(const Expression& expr, - const ThreadPoolDevice& device) { - Evaluator evaluator(expr, device); - Index total_size = array_prod(evaluator.dimensions()); - Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar); - - if (total_size < cache_size && - !ExpressionHasTensorBroadcastingOp::value) { - // TODO(andydavis) Reduce block management overhead for small tensors. 
- internal::TensorExecutor::run(expr, - device); - evaluator.cleanup(); - return; - } - - const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); - if (needs_assign) { - const TilingContext tiling = - internal::GetTensorExecutorTilingContext(device, evaluator); - - device.parallelFor( - tiling.block_mapper.total_block_count(), tiling.cost, - [=, &device, &evaluator, &tiling](StorageIndex firstIdx, - StorageIndex lastIdx) { - ScalarNoConst* thread_buf = - tiling.template GetCurrentThreadBuffer(device); - for (StorageIndex i = firstIdx; i < lastIdx; ++i) { - auto block = tiling.block_mapper.GetBlockForIndex(i, thread_buf); - evaluator.evalBlock(&block); - } - }); - device.deallocate(tiling.buffer); - } - evaluator.cleanup(); - } -}; - template class TensorExecutor { @@ -603,91 +486,6 @@ class TensorAsyncExecutor -class TensorAsyncExecutor { - public: - typedef typename traits::Index StorageIndex; - typedef typename traits::Scalar Scalar; - typedef typename remove_const::type ScalarNoConst; - - static const int NumDims = traits::NumDimensions; - - typedef TensorEvaluator Evaluator; - typedef TensorBlockMapper - BlockMapper; - typedef TensorExecutorTilingContext TilingContext; - - static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, - const ThreadPoolDevice& device, - DoneCallback done) { - TensorAsyncExecutorContext* const ctx = - new TensorAsyncExecutorContext(expr, device, std::move(done)); - - Index total_size = array_prod(ctx->evaluator.dimensions()); - Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar); - - if (total_size < cache_size && - !ExpressionHasTensorBroadcastingOp::value) { - auto delete_ctx = [ctx]() { delete ctx; }; - internal::TensorAsyncExecutor< - Expression, ThreadPoolDevice, decltype(delete_ctx), Vectorizable, - /*Tileable*/ TiledEvaluation::Off>::runAsync(expr, device, std::move(delete_ctx)); - return; - } - - const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void { - if (!need_assign) { - delete ctx; - return; - } - - ctx->tiling = - GetTensorExecutorTilingContext( - device, ctx->evaluator); - - auto eval_block = [ctx](StorageIndex firstIdx, StorageIndex lastIdx) { - ScalarNoConst* thread_buf = - ctx->tiling.template GetCurrentThreadBuffer( - ctx->device); - for (StorageIndex i = firstIdx; i < lastIdx; ++i) { - auto block = ctx->tiling.block_mapper.GetBlockForIndex(i, thread_buf); - ctx->evaluator.evalBlock(&block); - } - }; - device.parallelForAsync(ctx->tiling.block_mapper.total_block_count(), - ctx->tiling.cost, eval_block, - [ctx]() { delete ctx; }); - }; - - ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); - } - - private: - struct TensorAsyncExecutorContext { - TensorAsyncExecutorContext(const Expression& expr, - const ThreadPoolDevice& thread_pool, - DoneCallback done) - : device(thread_pool), - evaluator(expr, thread_pool), - on_done(std::move(done)) {} - - ~TensorAsyncExecutorContext() { - device.deallocate(tiling.buffer); - evaluator.cleanup(); - on_done(); - } - - const ThreadPoolDevice& device; - Evaluator evaluator; - TilingContext tiling; - - private: - DoneCallback on_done; - }; -}; - template class TensorAsyncExecutor { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 55c7d6831..a8841bc38 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -133,7 +133,6 @@ struct TensorEvaluator, D enum { IsAligned = false, PacketAccess = true, - BlockAccess = false, 
BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 5f06c97ab..ea3ea2c91 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -41,7 +41,6 @@ class TensorFixedSize : public TensorBase0), PacketAccess = (internal::packet_traits::size > 1), - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = false, Layout = Options_ & RowMajor ? RowMajor : ColMajor, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index e5b67a18c..8d17d4b76 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -96,7 +96,6 @@ struct TensorEvaluator, Device> enum { IsAligned = true, PacketAccess = (PacketType::size > 1), - BlockAccess = internal::is_arithmetic::value, BlockAccessV2 = internal::is_arithmetic::value, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, @@ -105,11 +104,6 @@ struct TensorEvaluator, Device> static const int NumDims = internal::traits::NumDimensions; - typedef typename internal::TensorBlock - TensorBlock; - typedef typename internal::TensorBlockReader - TensorBlockReader; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; @@ -185,11 +179,6 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( std::vector*) const {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const { - assert(m_buffer != NULL); - TensorBlockReader::Run(block, m_buffer); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 0da2d9e0d..389d5d906 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -158,7 +158,6 @@ struct IsVectorizable { enum TiledEvaluation { Off = 0, // tiled evaluation is not supported On = 1, // still work in progress (see TensorBlockV2.h) - Legacy = 2 // soon to be deprecated (see TensorBock.h) }; template @@ -166,18 +165,12 @@ struct IsTileable { // Check that block evaluation is supported and it's a preferred option (at // least one sub-expression has much faster block evaluation, e.g. // broadcasting). - static const bool BlockAccess = - TensorEvaluator::BlockAccess && - TensorEvaluator::PreferBlockAccess; - static const bool BlockAccessV2 = TensorEvaluator::BlockAccessV2 && TensorEvaluator::PreferBlockAccess; static const TiledEvaluation value = - BlockAccessV2 - ? TiledEvaluation::On - : (BlockAccess ? TiledEvaluation::Legacy : TiledEvaluation::Off); + BlockAccessV2 ? 
TiledEvaluation::On : TiledEvaluation::Off; }; template , Device> enum { IsAligned = false, PacketAccess = (PacketType::size > 1), - BlockAccess = true, BlockAccessV2 = true, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, @@ -183,60 +182,6 @@ struct TensorEvaluator, Device> Index count; }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( - TensorBlock* output_block) const { - if (NumDims <= 0) return; - - static const bool is_col_major = - static_cast(Layout) == static_cast(ColMajor); - - // Compute spatial coordinates for the first block element. - array coords; - extract_coordinates(output_block->first_coeff_index(), coords); - array initial_coords = coords; - - CoeffReturnType* data = output_block->data(); - Index offset = 0; - - // Initialize output block iterator state. Dimension in this array are - // always in inner_most -> outer_most order (col major layout). - array it; - for (Index i = 0; i < NumDims; ++i) { - const Index dim = is_col_major ? i : NumDims - 1 - i; - it[i].size = output_block->block_sizes()[dim]; - it[i].stride = output_block->block_strides()[dim]; - it[i].span = it[i].stride * (it[i].size - 1); - it[i].count = 0; - } - eigen_assert(it[0].stride == 1); - - while (it[NumDims - 1].count < it[NumDims - 1].size) { - // Generate data for the inner-most dimension. - for (Index i = 0; i < it[0].size; ++i) { - *(data + offset + i) = m_generator(coords); - coords[is_col_major ? 0 : NumDims - 1]++; - } - coords[is_col_major ? 0 : NumDims - 1] = - initial_coords[is_col_major ? 0 : NumDims - 1]; - - // For the 1d tensor we need to generate only one inner-most dimension. - if (NumDims == 1) break; - - // Update offset. - for (Index i = 1; i < NumDims; ++i) { - if (++it[i].count < it[i].size) { - offset += it[i].stride; - coords[is_col_major ? i : NumDims - 1 - i]++; - break; - } - if (i != NumDims - 1) it[i].count = 0; - coords[is_col_major ? i : NumDims - 1 - i] = - initial_coords[is_col_major ? i : NumDims - 1 - i]; - offset -= it[i].span; - } - } - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 38bf80c5d..49bc60f0a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -231,7 +231,6 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = true, BlockAccessV2 = false, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, @@ -541,139 +540,6 @@ struct TensorEvaluator, Device> internal::kSkewedInnerDims, block_total_size_max)); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( - OutputTensorBlock* output_block) const { - typedef internal::ImagePatchCopyOp ImagePatchCopyOp; - typedef internal::ImagePatchPaddingOp ImagePatchPaddingOp; - - // Calculate loop limits and various input/output dim sizes. - const DSizes& block_sizes = output_block->block_sizes(); - const bool col_major = - static_cast(Layout) == static_cast(ColMajor); - const Index depth_dim_size = block_sizes[col_major ? 0 : NumDims - 1]; - const Index output_depth_dim_size = - m_dimensions[col_major ? 0 : NumDims - 1]; - const Index row_dim_size = block_sizes[col_major ? 1 : NumDims - 2]; - const Index output_row_dim_size = m_dimensions[col_major ? 1 : NumDims - 2]; - const Index col_dim_size = block_sizes[col_major ? 
2 : NumDims - 3]; - const Index block_col_stride = row_dim_size * depth_dim_size; - const Index patch_index_dim_size = block_sizes[col_major ? 3 : NumDims - 4]; - const Index outer_dim_size = - block_sizes.TotalSize() / - (depth_dim_size * row_dim_size * col_dim_size * patch_index_dim_size); - - const Index patch_size = row_dim_size * col_dim_size * depth_dim_size; - const Index batch_size = patch_size * patch_index_dim_size; - - Index output_index = output_block->first_coeff_index(); - - // Loop through outer dimensions. - for (Index outer_dim_index = 0; outer_dim_index < outer_dim_size; - ++outer_dim_index) { - const Index outer_output_base_index = outer_dim_index * batch_size; - // Find the offset of the element wrt the location of the first element. - const Index patchIndexStart = output_index / m_fastPatchStride; - const Index patchOffset = - (output_index - patchIndexStart * m_patchStride) / m_fastOutputDepth; - const Index colOffsetStart = patchOffset / m_fastColStride; - // Other ways to index this element. - const Index otherIndex = - (NumDims == 4) ? 0 : output_index / m_fastOtherStride; - const Index patch2DIndexStart = - (NumDims == 4) - ? 0 - : (output_index - otherIndex * m_otherStride) / m_fastPatchStride; - // Calculate starting depth index. - const Index depth = output_index - (output_index / m_fastOutputDepth) * - output_depth_dim_size; - const Index patch_input_base_index = - depth + otherIndex * m_patchInputStride; - - // Loop through patches. - for (Index patch_index_dim_index = 0; - patch_index_dim_index < patch_index_dim_size; - ++patch_index_dim_index) { - const Index patch_output_base_index = - outer_output_base_index + patch_index_dim_index * patch_size; - // Patch index corresponding to the passed in index. - const Index patchIndex = patchIndexStart + patch_index_dim_index; - const Index patch2DIndex = - (NumDims == 4) ? patchIndex - : patch2DIndexStart + patch_index_dim_index; - const Index colIndex = patch2DIndex / m_fastOutputRows; - const Index input_col_base = colIndex * m_col_strides; - const Index row_offset_base = - (patch2DIndex - colIndex * m_outputRows) * m_row_strides - - m_rowPaddingTop; - - // Loop through columns. - for (Index col_dim_index = 0; col_dim_index < col_dim_size; - ++col_dim_index) { - const Index col_output_base_index = - patch_output_base_index + col_dim_index * block_col_stride; - - // Calculate col index in the input original tensor. - Index colOffset = colOffsetStart + col_dim_index; - Index inputCol = - input_col_base + colOffset * m_in_col_strides - m_colPaddingLeft; - Index origInputCol = - (m_col_inflate_strides == 1) - ? inputCol - : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0); - - bool pad_column = false; - if (inputCol < 0 || inputCol >= m_input_cols_eff || - ((m_col_inflate_strides != 1) && - (inputCol != origInputCol * m_col_inflate_strides))) { - pad_column = true; - } - - const Index col_input_base_index = - patch_input_base_index + origInputCol * m_colInputStride; - const Index input_row_base = - row_offset_base + - ((patchOffset + col_dim_index * output_row_dim_size) - - colOffset * m_colStride) * - m_in_row_strides; - // Loop through rows. - for (Index row_dim_index = 0; row_dim_index < row_dim_size; - ++row_dim_index) { - const Index output_base_index = - col_output_base_index + row_dim_index * depth_dim_size; - bool pad_row = false; - Index inputIndex; - if (!pad_column) { - Index inputRow = - input_row_base + row_dim_index * m_in_row_strides; - Index origInputRow = - (m_row_inflate_strides == 1) - ? 
inputRow - : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride) - : 0); - if (inputRow < 0 || inputRow >= m_input_rows_eff || - ((m_row_inflate_strides != 1) && - (inputRow != origInputRow * m_row_inflate_strides))) { - pad_row = true; - } else { - inputIndex = - col_input_base_index + origInputRow * m_rowInputStride; - } - } - // Copy (or pad) along depth dimension. - if (pad_column || pad_row) { - ImagePatchPaddingOp::Run(depth_dim_size, Scalar(m_paddingValue), - output_base_index, output_block->data()); - } else { - ImagePatchCopyOp::Run(*this, depth_dim_size, output_base_index, - output_block->data(), inputIndex); - } - } - } - } - output_index += m_otherStride; - } - } - protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index e1df84a1d..ef6b62620 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -92,7 +92,6 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index 1da7a4e23..695726e10 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -119,7 +119,6 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, @@ -199,7 +198,6 @@ template enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 781f1d75b..700337539 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -135,11 +135,6 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - // TODO(andydavis, wuke) Enable BlockAccess for the general case when the - // performance issue with block-based reshape is resolved. - BlockAccess = TensorEvaluator::BlockAccess && - TensorEvaluator::RawAccess && - NumInputDims > 0 && NumOutputDims > 0, // For trivial reshapes with raw access to underlying data we will provide // zero overhead block access. // TODO(ezhulenev): Consider adding block access without raw access? 
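The comment retained above states the rationale for dropping the reshape-specific block() machinery: when the input exposes raw, contiguous data, a reshape is only a reindexing, so blocks of the output can be materialized straight from the underlying buffer. A minimal standalone sketch of that index equivalence follows (plain C++, no Eigen types; the 4x6 and 2x2x6 shapes are arbitrary examples chosen for illustration).

// Hedged sketch (not Eigen code): a reshape over contiguous storage does not
// move data, it only reinterprets the linear index, so a block of the
// reshaped view is just a slice of the raw buffer.
#include <cassert>
#include <numeric>
#include <vector>

int main() {
  std::vector<float> data(24);
  std::iota(data.begin(), data.end(), 0.0f);  // contiguous col-major storage

  // The same buffer viewed as 4x6 and as 2x2x6 (both col-major): a coefficient
  // has the same linear offset in either view.
  auto idx_4x6   = [](int r, int c)        { return r + 4 * c; };
  auto idx_2x2x6 = [](int i, int j, int k) { return i + 2 * (j + 2 * k); };

  assert(data[idx_4x6(3, 5)] == data[idx_2x2x6(1, 1, 5)]);  // same coefficient
  return 0;
}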
@@ -153,14 +148,6 @@ struct TensorEvaluator, Device> typedef typename internal::remove_const::type ScalarNoConst; - typedef internal::TensorBlock - InputTensorBlock; - typedef internal::TensorBlock - OutputTensorBlock; - typedef internal::TensorBlockReader - OutputTensorBlockReader; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; @@ -177,30 +164,6 @@ struct TensorEvaluator, Device> // The total size of the reshaped tensor must be equal to the total size // of the input tensor. eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions())); - - if (BlockAccess) { - const typename TensorEvaluator::Dimensions& input_dims = - m_impl.dimensions(); - if (static_cast(Layout) == static_cast(ColMajor)) { - m_outputStrides[0] = 1; - for (int i = 1; i < NumOutputDims; ++i) { - m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; - } - m_inputStrides[0] = 1; - for (int i = 1; i < NumInputDims; ++i) { - m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; - } - } else { - m_outputStrides[NumOutputDims - 1] = 1; - for (int i = NumOutputDims - 2; i >= 0; --i) { - m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; - } - m_inputStrides[NumInputDims - 1] = 1; - for (int i = NumInputDims - 2; i >= 0; --i) { - m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; - } - } - } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -249,128 +212,6 @@ struct TensorEvaluator, Device> Index size; Index count; }; - // TODO(andydavis) Reduce the overhead of this function. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( - OutputTensorBlock* output_block) const { - if (m_impl.data() != NULL) { - OutputTensorBlockReader::Run(output_block, m_impl.data()); - return; - } - - // Calculate output block unit-stride inner dimension length. - const DSizes& output_block_sizes = - output_block->block_sizes(); - Index output_inner_dim_size = 1; - Index output_outer_dim_start = NumOutputDims; - for (Index i = 0; i < NumOutputDims; ++i) { - const Index dim = static_cast(Layout) == static_cast(ColMajor) - ? i : NumOutputDims - i - 1; - output_inner_dim_size *= output_block_sizes[dim]; - if (output_block_sizes[dim] < m_dimensions[dim]) { - output_outer_dim_start = i + 1; - break; - } - } - - // Initialize output block iterator state. - array block_iter_state; - - for (Index i = 0; i < NumOutputDims; ++i) { - const Index dim = static_cast(Layout) == static_cast(ColMajor) - ? i : NumOutputDims - i - 1; - block_iter_state[i].size = output_block_sizes[dim]; - block_iter_state[i].stride = m_outputStrides[dim]; - block_iter_state[i].span = - block_iter_state[i].stride * (block_iter_state[i].size - 1); - block_iter_state[i].count = 0; - } - - const Index output_outer_dim_size = output_block_sizes.TotalSize() / - output_inner_dim_size; - const typename TensorEvaluator::Dimensions& input_dims = - m_impl.dimensions(); - - Index index = output_block->first_coeff_index(); - for (Index outer_idx = 0; outer_idx < output_outer_dim_size; ++outer_idx) { - Index inner_idx = 0; - while (inner_idx < output_inner_dim_size) { - // Calculate input coords based on 'index'. 
- array input_coords; - Index idx = index; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumInputDims - 1; i > 0; --i) { - input_coords[i] = idx / m_inputStrides[i]; - idx -= input_coords[i] * m_inputStrides[i]; - } - input_coords[0] = idx; - } else { - for (int i = 0; i < NumInputDims - 1; ++i) { - input_coords[i] = idx / m_inputStrides[i]; - idx -= input_coords[i] * m_inputStrides[i]; - } - input_coords[NumInputDims - 1] = idx; - } - - // Calculate target input block shape, using at most - // 'output_inner_dim_size' coefficients along the input block's inner - // dimensions. - DSizes input_block_sizes; - Index num_to_allocate = output_inner_dim_size - inner_idx; - for (Index i = 0; i < NumInputDims; ++i) { - const Index dim = - static_cast(Layout) == static_cast(ColMajor) - ? i : NumInputDims - i - 1; - input_block_sizes[dim] = numext::mini( - num_to_allocate, (static_cast(input_dims[dim]) - - input_coords[dim])); - if (input_coords[dim] == 0) { - num_to_allocate /= input_block_sizes[dim]; - } else { - num_to_allocate = 1; - } - } - - // Calculate input block strides. - DSizes input_block_strides; - if (static_cast(Layout) == static_cast(ColMajor)) { - input_block_strides[0] = 1; - for (int i = 1; i < NumInputDims; ++i) { - input_block_strides[i] = input_block_strides[i - 1] * - input_block_sizes[i - 1]; - } - } else { - input_block_strides[NumInputDims - 1] = 1; - for (int i = NumInputDims - 2; i >= 0; --i) { - input_block_strides[i] = input_block_strides[i + 1] * - input_block_sizes[i + 1]; - } - } - - // Instantiate and read input block from input tensor. - InputTensorBlock input_block(index, input_block_sizes, - input_block_strides, m_inputStrides, - output_block->data() + outer_idx * - output_inner_dim_size + inner_idx); - - m_impl.block(&input_block); - - const Index input_block_total_size = input_block_sizes.TotalSize(); - index += input_block_total_size; - inner_idx += input_block_total_size; - } - eigen_assert(inner_idx == output_inner_dim_size); - index -= output_inner_dim_size; - // Update index. - for (Index i = output_outer_dim_start; i < NumOutputDims; ++i) { - if (++block_iter_state[i].count < block_iter_state[i].size) { - index += block_iter_state[i].stride; - break; - } - block_iter_state[i].count = 0; - index -= block_iter_state[i].span; - } - } - } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, @@ -408,8 +249,6 @@ struct TensorEvaluator, Device> protected: TensorEvaluator m_impl; NewDimensions m_dimensions; - DSizes m_outputStrides; - DSizes m_inputStrides; }; @@ -426,7 +265,6 @@ template enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, BlockAccessV2 = TensorEvaluator::RawAccess, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, @@ -619,7 +457,6 @@ struct TensorEvaluator, Devi // slice offsets and sizes. IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, BlockAccessV2 = TensorEvaluator::BlockAccessV2, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, @@ -714,7 +551,7 @@ struct TensorEvaluator, Devi } } // Use memcpy if it's going to be faster than using the regular evaluation. 
- const MemcpyTriggerForSlicing trigger(m_device); + const MemcpyTriggerForSlicing trigger(m_device); if (trigger(internal::array_prod(dimensions()), contiguous_values)) { EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data(); for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { @@ -808,16 +645,6 @@ struct TensorEvaluator, Devi m_impl.getResourceRequirements(resources); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( - TensorBlock* output_block) const { - TensorBlock input_block(srcCoeff(output_block->first_coeff_index()), - output_block->block_sizes(), - output_block->block_strides(), - TensorBlockDimensions(m_inputStrides), - output_block->data()); - m_impl.block(&input_block); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { @@ -922,7 +749,6 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, BlockAccessV2 = TensorEvaluator::BlockAccessV2, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, @@ -1124,7 +950,6 @@ struct TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, @@ -1306,7 +1131,6 @@ struct TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 1104f02c7..4a22922d9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -98,7 +98,6 @@ struct TensorEvaluator, Device enum { IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, BlockAccessV2 = TensorEvaluator::RawAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 80afcff0f..4abe58ecd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -96,7 +96,6 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index d826cfb7e..84604cf41 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -584,7 +584,6 @@ struct TensorReductionEvaluatorBase::Layout, @@ -594,11 +593,6 @@ struct TensorReductionEvaluatorBase::type ScalarNoConst; - typedef internal::TensorBlock - OutputTensorBlock; - typedef internal::TensorBlock - InputTensorBlock; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// typedef internal::TensorBlockNotImplemented TensorBlockV2; //===--------------------------------------------------------------------===// @@ -920,258 +914,6 @@ struct TensorReductionEvaluatorBasefirst_coeff_index() == 0); - eigen_assert(output_block->block_sizes().TotalSize() == 1); - Op reducer(m_reducer); - output_block->data()[0] = internal::InnerMostDimReducer::reduce( - *this, 0, m_numValuesToReduce, reducer); - return; - } - - // Calculate input tensor 'slice' required to reduce output block coeffs. 
- DSizes input_slice_sizes(m_impl.dimensions()); - for (int i = 0; i < NumOutputDims; ++i) { - // Clip preserved input dimensions by output block size. - input_slice_sizes[m_output_to_input_dim_map[i]] = - output_block->block_sizes()[i]; - } - - // Shard input tensor slice into blocks (because it could be large if we - // need to reduce along several dimensions to calculate required output - // coefficients). - const Index max_coeff_count = - numext::mini(((m_device.firstLevelCacheSize()) / sizeof(Scalar)), - input_slice_sizes.TotalSize()); - - // Calculate max output shard size needed to keep working set of reducers - // in L1, while leaving enough space for reducer overhead and 'PacketSize' - // reductions. - DSizes target_input_block_sizes; - CalculateTargetInputBlockShape(max_coeff_count, input_slice_sizes, - &target_input_block_sizes); - // Calculate indices for first preserved dimension. - const Index first_preserved_dim_output_index = - static_cast(Layout) == static_cast(ColMajor) - ? 0 - : NumOutputDims - 1; - const Index first_preserved_dim_input_index = - m_output_to_input_dim_map[first_preserved_dim_output_index]; - const bool inner_most_dim_preserved = - PreservingInnerMostDims || - (first_preserved_dim_input_index == - (static_cast(Layout) == static_cast(ColMajor) - ? 0 - : NumInputDims - 1)); - - // Calculate output block inner/outer dimension sizes. - const Index output_block_inner_dim_size = - output_block->block_sizes()[first_preserved_dim_output_index]; - const Index output_block_outer_dim_size = - output_block->block_sizes().TotalSize() / output_block_inner_dim_size; - // Calculate shard size for first preserved dimension. - const Index output_shard_size = - target_input_block_sizes[first_preserved_dim_input_index]; - const Index num_output_shards = - (output_block_inner_dim_size + output_shard_size - 1) / - output_shard_size; - - // Initialize 'tensor_slice_offsets' from input coords of output index. - DSizes tensor_slice_offsets; - GetInputCoordsForOutputIndex(output_block->first_coeff_index(), - &tensor_slice_offsets); - - // Store tensor slice offset in first preserved dimension to be used - // to update tensor slice extents in loop below. - const Index first_preserved_dim_offset_start = - tensor_slice_offsets[first_preserved_dim_input_index]; - - array block_iter_state; - - // Initialize state used to iterate through output coefficients - // and update 'tensor_slice_offsets' in outer preserved dims. - for (int i = 0; i < NumOutputDims - 1; ++i) { - const int dim = static_cast(Layout) == static_cast(ColMajor) - ? i + 1 - : NumOutputDims - i - 2; - block_iter_state[i].input_dim = m_output_to_input_dim_map[dim]; - block_iter_state[i].output_size = output_block->block_sizes()[dim]; - block_iter_state[i].output_count = 0; - } - - // Allocate input block memory. - ScalarNoConst* input_block_data = static_cast( - m_device.allocate(max_coeff_count * sizeof(Scalar))); - // Allocate reducer memory. - const bool packet_reductions_enabled = - (Self::InputPacketAccess & Self::ReducerTraits::PacketAccess); - const Index num_reducers = - (inner_most_dim_preserved && packet_reductions_enabled) - ? 
(output_shard_size / PacketSize + output_shard_size % PacketSize + - PacketSize) - : output_shard_size; - typedef internal::BlockReducer BlockReducer; - BlockReducer* reducers = static_cast( - m_device.allocate(num_reducers * sizeof(BlockReducer))); - - InputDimensions input_tensor_dims(m_impl.dimensions()); - for (Index output_outer_index = 0; - output_outer_index < output_block_outer_dim_size; - ++output_outer_index) { - for (Index output_shard_index = 0; output_shard_index < num_output_shards; - ++output_shard_index) { - // Initialize 'tensor_slice_extents' for this output shard. - DSizes tensor_slice_extents(input_slice_sizes); - for (int i = 0; i < NumInputDims; ++i) { - if (i == first_preserved_dim_input_index) { - // Clip first preserved dim size to output shard size. - tensor_slice_extents[i] = numext::mini( - output_shard_size, - input_slice_sizes[i] - (tensor_slice_offsets[i] - - first_preserved_dim_offset_start)); - - } else if (!m_reduced[i]) { - // Clip outer preserved dims to size 1, so that we reduce a - // contiguous set of output coefficients. - tensor_slice_extents[i] = 1; - } - } - - // Initialize output coefficient reducers. - for (int i = 0; i < num_reducers; ++i) { - new (&reducers[i]) BlockReducer(m_reducer); - } - - typedef internal::TensorSliceBlockMapper - TensorSliceBlockMapper; - - // TODO(andydavis) Consider removing 'input_block_stride_order' if we - // find that scattered reads are not worth supporting in - // TensorSliceBlockMapper. - TensorSliceBlockMapper block_mapper( - typename TensorSliceBlockMapper::Dimensions(input_tensor_dims), - tensor_slice_offsets, tensor_slice_extents, - target_input_block_sizes, DimensionList()); - - const Index num_outputs_to_update = - tensor_slice_extents[first_preserved_dim_input_index]; - const Index preserved_dim_vector_reducer_count = - (inner_most_dim_preserved && packet_reductions_enabled) - ? num_outputs_to_update / PacketSize - : 0; - const Index preserved_dim_vector_coeff_count = - inner_most_dim_preserved - ? preserved_dim_vector_reducer_count * PacketSize - : 0; - const Index preserved_dim_reducer_limit = - (inner_most_dim_preserved && packet_reductions_enabled) - ? (preserved_dim_vector_reducer_count + - num_outputs_to_update % PacketSize) - : num_outputs_to_update; - - const Index total_block_count = block_mapper.total_block_count(); - for (Index b = 0; b < total_block_count; ++b) { - InputTensorBlock input_block = - block_mapper.GetBlockForIndex(b, input_block_data); - // Read. - m_impl.block(&input_block); - - Index num_values_to_reduce = 1; - for (Index i = 0; i < NumInputDims; ++i) { - if (m_reduced[i]) { - num_values_to_reduce *= input_block.block_sizes()[i]; - } - } - // Reduce. 
- if (inner_most_dim_preserved) { - const Index input_outer_dim_size = - input_block.block_sizes().TotalSize() / num_outputs_to_update; - for (Index input_outer_dim_index = 0; - input_outer_dim_index < input_outer_dim_size; - ++input_outer_dim_index) { - const Index input_outer_dim_base = - input_outer_dim_index * num_outputs_to_update; - for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) { - reducers[i].Reduce(input_outer_dim_base + i * PacketSize, - PacketSize, input_block.data()); - } - const Index scalar_reducer_base = - input_outer_dim_base + preserved_dim_vector_coeff_count; - for (Index i = preserved_dim_vector_reducer_count; - i < preserved_dim_reducer_limit; ++i) { - reducers[i].Reduce(scalar_reducer_base + i - - preserved_dim_vector_reducer_count, - 1, input_block.data()); - } - } - } else { - for (Index i = 0; i < num_outputs_to_update; ++i) { - reducers[i].Reduce(i * num_values_to_reduce, num_values_to_reduce, - input_block.data()); - } - } - } - - // Finalize all reducers for this output shard. - const Index output_base_index = - output_outer_index * output_block_inner_dim_size + - output_shard_index * output_shard_size; - if (inner_most_dim_preserved) { - EIGEN_ALIGN_MAX - typename internal::remove_const::type - values[PacketSize]; - for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) { - const Index reducer_base = output_base_index + i * PacketSize; - internal::pstore( - values, reducers[i].FinalizePacket()); - for (Index j = 0; j < PacketSize; ++j) { - output_block->data()[reducer_base + j] = values[j]; - } - } - const Index scalar_reducer_base = - output_base_index + preserved_dim_vector_coeff_count; - - for (Index i = preserved_dim_vector_reducer_count; - i < preserved_dim_reducer_limit; ++i) { - output_block->data()[scalar_reducer_base + i - - preserved_dim_vector_reducer_count] = - reducers[i].Finalize(); - } - } else { - for (int i = 0; i < num_outputs_to_update; ++i) { - output_block->data()[output_base_index + i] = - reducers[i].Finalize(); - } - } - - // Update 'tensor_slice_offsets' by num outputs for this output shard. - tensor_slice_offsets[first_preserved_dim_input_index] += - num_outputs_to_update; - } - // Update slice offset for inner preserved dim. - tensor_slice_offsets[first_preserved_dim_input_index] -= - output_block_inner_dim_size; - // Update slice offsets for remaining output dims. - for (int i = 0; i < NumOutputDims - 1; ++i) { - BlockIteratorState& b = block_iter_state[i]; - if (++b.output_count < b.output_size) { - ++tensor_slice_offsets[b.input_dim]; - break; - } - b.output_count = 0; - tensor_slice_offsets[b.input_dim] -= b.output_size - 1; - } - } - - // Free memory. 
- m_device.deallocate(input_block_data); - m_device.deallocate(reducers); - } - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } EIGEN_DEVICE_FUNC const Device& device() const { return m_device; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index 87072006d..ff5bfad46 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -141,7 +141,6 @@ template class TensorRef : public TensorBase, Device> enum { IsAligned = false, PacketAccess = false, - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorRef::Layout, @@ -432,7 +430,6 @@ struct TensorEvaluator, Device> : public TensorEvaluator, Device enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = true, BlockAccessV2 = NumDims > 0, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, @@ -248,112 +247,6 @@ struct TensorEvaluator, Device internal::kSkewedInnerDims, block_total_size_max)); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( - OutputTensorBlock* output_block) const { - if (NumDims <= 0) return; - - // TODO(ezhulenev): If underlying tensor expression supports and prefers - // block evaluation we must use it. Currently we use coeff and packet - // access into the underlying tensor expression. - // static const bool useBlockAccessForArgType = - // TensorEvaluator::BlockAccess && - // TensorEvaluator::PreferBlockAccess; - - static const bool isColMajor = - static_cast(Layout) == static_cast(ColMajor); - - static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1; - const bool inner_dim_reversed = m_reverse[inner_dim_idx]; - - CoeffReturnType* data = output_block->data(); - Index block_offset = 0; - - Index input_offset = reverseIndex(output_block->first_coeff_index()); - - // Initialize output block iterator state. Dimension in this array are - // always in inner_most -> outer_most order (col major layout). - array it; - for (Index i = 0; i < NumDims; ++i) { - const Index dim = isColMajor ? i : NumDims - 1 - i; - it[i].size = output_block->block_sizes()[dim]; - it[i].count = 0; - it[i].reverse = m_reverse[dim]; - - it[i].block_stride = output_block->block_strides()[dim]; - it[i].block_span = it[i].block_stride * (it[i].size - 1); - - it[i].input_stride = m_strides[dim]; - it[i].input_span = it[i].input_stride * (it[i].size - 1); - - if (it[i].reverse) { - it[i].input_stride = -1 * it[i].input_stride; - it[i].input_span = -1 * it[i].input_span; - } - } - - // If multiple inner dimensions have the same reverse flag, check if we can - // merge them into a single virtual inner dimension. - int effective_inner_dim = 0; - for (int i = 1; i < NumDims; ++i) { - if (it[i].reverse != it[effective_inner_dim].reverse) break; - if (it[i].block_stride != it[effective_inner_dim].size) break; - if (it[i].block_stride != numext::abs(it[i].input_stride)) break; - - it[i].size = it[effective_inner_dim].size * it[i].size; - - it[i].block_stride = 1; - it[i].input_stride = (inner_dim_reversed ? -1 : 1); - - it[i].block_span = it[i].block_stride * (it[i].size - 1); - it[i].input_span = it[i].input_stride * (it[i].size - 1); - - effective_inner_dim = i; - } - - eigen_assert(it[effective_inner_dim].block_stride == 1); - eigen_assert(it[effective_inner_dim].input_stride == - (inner_dim_reversed ? 
-1 : 1)); - - const Index inner_dim_size = it[effective_inner_dim].size; - - while (it[NumDims - 1].count < it[NumDims - 1].size) { - // Copy inner-most dimension data from reversed location in input. - Index dst = block_offset; - Index src = input_offset; - - // NOTE(ezhulenev): Adding vectorized path with internal::preverse showed - // worse results in benchmarks than a simple coefficient loop. - if (inner_dim_reversed) { - for (Index i = 0; i < inner_dim_size; ++i) { - data[dst] = m_impl.coeff(src); - ++dst; - --src; - } - } else { - for (Index i = 0; i < inner_dim_size; ++i) { - data[dst] = m_impl.coeff(src); - ++dst; - ++src; - } - } - - // For the 1d tensor we need to generate only one inner-most dimension. - if ((NumDims - effective_inner_dim) == 1) break; - - // Update offset. - for (Index i = effective_inner_dim + 1; i < NumDims; ++i) { - if (++it[i].count < it[i].size) { - block_offset += it[i].block_stride; - input_offset += it[i].input_stride; - break; - } - if (i != NumDims - 1) it[i].count = 0; - block_offset -= it[i].block_span; - input_offset -= it[i].input_span; - } - } - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool /*root_of_expr_ast*/ = false) const { @@ -535,7 +428,6 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index 1e6fc93b1..d8005d604 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -99,7 +99,6 @@ struct TensorEvaluator, Device> { enum { IsAligned = false, PacketAccess = (PacketType::size > 1), - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index df4cd1eb3..72c43a39d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -115,7 +115,6 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = (PacketType::size > 1), - BlockAccess = TensorEvaluator::BlockAccess, BlockAccessV2 = TensorEvaluator::RawAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, @@ -125,11 +124,6 @@ struct TensorEvaluator, Device> typedef typename internal::remove_const::type ScalarNoConst; - typedef internal::TensorBlock - TensorBlock; - typedef internal::TensorBlockReader - TensorBlockReader; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; @@ -249,98 +243,6 @@ struct TensorEvaluator, Device> internal::kUniformAllDims, block_total_size_max)); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( - TensorBlock* output_block) const { - if (m_impl.data() != NULL) { - // Fast path: we have direct access to the data, so shuffle as we read. - TensorBlockReader::Run(output_block, - srcCoeff(output_block->first_coeff_index()), - m_inverseShuffle, - m_unshuffledInputStrides, - m_impl.data()); - return; - } - - // Slow path: read unshuffled block from the input and shuffle in-place. - // Initialize input block sizes using input-to-output shuffle map. 
- DSizes input_block_sizes; - for (Index i = 0; i < NumDims; ++i) { - input_block_sizes[i] = output_block->block_sizes()[m_inverseShuffle[i]]; - } - - // Calculate input block strides. - DSizes input_block_strides; - if (static_cast(Layout) == static_cast(ColMajor)) { - input_block_strides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - input_block_strides[i] = - input_block_strides[i - 1] * input_block_sizes[i - 1]; - } - } else { - input_block_strides[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - input_block_strides[i] = - input_block_strides[i + 1] * input_block_sizes[i + 1]; - } - } - DSizes, NumDims> fast_input_block_strides; - for (int i = 0; i < NumDims; ++i) { - fast_input_block_strides[i] = - internal::TensorIntDivisor(input_block_strides[i]); - } - - // Read input block. - TensorBlock input_block(srcCoeff(output_block->first_coeff_index()), - input_block_sizes, - input_block_strides, - Dimensions(m_unshuffledInputStrides), - output_block->data()); - - m_impl.block(&input_block); - - // Naive In-place shuffle: random IO but block size is O(L1 cache size). - // TODO(andydavis) Improve the performance of this in-place shuffle. - const Index total_size = input_block_sizes.TotalSize(); - std::vector bitmap(total_size, false); - ScalarNoConst* data = const_cast(output_block->data()); - const DSizes& output_block_strides = - output_block->block_strides(); - for (Index input_index = 0; input_index < total_size; ++input_index) { - if (bitmap[input_index]) { - // Coefficient at this index has already been shuffled. - continue; - } - - Index output_index = - GetBlockOutputIndex(input_index, input_block_strides, - output_block_strides, fast_input_block_strides); - if (output_index == input_index) { - // Coefficient already in place. - bitmap[output_index] = true; - continue; - } - - // The following loop starts at 'input_index', and shuffles - // coefficients into their shuffled location at 'output_index'. - // It skips through the array shuffling coefficients by following - // the shuffle cycle starting and ending a 'start_index'. 
- ScalarNoConst evicted_value; - ScalarNoConst shuffled_value = data[input_index]; - do { - evicted_value = data[output_index]; - data[output_index] = shuffled_value; - shuffled_value = evicted_value; - bitmap[output_index] = true; - output_index = - GetBlockOutputIndex(output_index, input_block_strides, - output_block_strides, fast_input_block_strides); - } while (output_index != input_index); - - data[output_index] = shuffled_value; - bitmap[output_index] = true; - } - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, bool root_of_expr_ast = false) const { @@ -462,7 +364,6 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = (PacketType::size > 1), - BlockAccess = TensorEvaluator::BlockAccess, BlockAccessV2 = TensorEvaluator::RawAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, @@ -471,11 +372,6 @@ struct TensorEvaluator, Device> typedef typename internal::remove_const::type ScalarNoConst; - typedef internal::TensorBlock - TensorBlock; - typedef internal::TensorBlockWriter - TensorBlockWriter; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// typedef internal::TensorBlockDescriptor TensorBlockDesc; //===--------------------------------------------------------------------===// @@ -502,15 +398,7 @@ struct TensorEvaluator, Device> } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( - const TensorBlock& block) { - eigen_assert(this->m_impl.data() != NULL); - TensorBlockWriter::Run(block, this->srcCoeff(block.first_coeff_index()), - this->m_inverseShuffle, - this->m_unshuffledInputStrides, this->m_impl.data()); - } - -template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( const TensorBlockDesc& desc, const TensorBlockV2& block) { eigen_assert(this->m_impl.data() != NULL); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 061bf6bdf..8a7fcac23 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -114,7 +114,6 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, @@ -288,7 +287,6 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h index 676717d8d..209d6fb3b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -97,7 +97,6 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index ced963175..a4c38f118 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -183,7 +183,6 @@ struct TensorEvaluator, D enum { 
IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, BlockAccessV2 = false, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp index 0fb189e09..8d3ca84c8 100644 --- a/unsupported/test/cxx11_tensor_block_access.cpp +++ b/unsupported/test/cxx11_tensor_block_access.cpp @@ -46,22 +46,6 @@ static DSizes RandomDims() { return DSizes(dims); } -/** Dummy data type to test TensorBlock copy ops. */ -struct Data { - Data() : value(0) {} - explicit Data(int v) : value(v) { } - int value; -}; - -bool operator==(const Data& lhs, const Data& rhs) { - return lhs.value == rhs.value; -} - -std::ostream& operator<<(std::ostream& os, const Data& d) { - os << "Data: value=" << d.value; - return os; -} - template static T* GenerateRandomData(const Index& size) { T* data = new T[size]; @@ -71,15 +55,6 @@ static T* GenerateRandomData(const Index& size) { return data; } -template <> -Data* GenerateRandomData(const Index& size) { - Data* data = new Data[size]; - for (int i = 0; i < size; ++i) { - data[i] = Data(internal::random(1, 100)); - } - return data; -} - template static void Debug(DSizes dims) { for (int i = 0; i < NumDims; ++i) { @@ -183,84 +158,6 @@ static void test_block_mapper_maps_every_element() { VERIFY_IS_EQUAL(*coeff_set.rbegin(), total_coeffs - 1); } -template -static void test_slice_block_mapper_maps_every_element() { - typedef internal::TensorBlock TensorBlock; - typedef internal::TensorSliceBlockMapper TensorSliceBlockMapper; - - DSizes tensor_dims = RandomDims(); - DSizes tensor_slice_offsets = RandomDims(); - DSizes tensor_slice_extents = RandomDims(); - - // Make sure that tensor offsets + extents do not overflow. - for (int i = 0; i < NumDims; ++i) { - tensor_slice_offsets[i] = - numext::mini(tensor_dims[i] - 1, tensor_slice_offsets[i]); - tensor_slice_extents[i] = numext::mini( - tensor_slice_extents[i], tensor_dims[i] - tensor_slice_offsets[i]); - } - - // Keep track of elements indices available via block access. - std::set coeff_set; - - int total_coeffs = static_cast(tensor_slice_extents.TotalSize()); - - // Pick a random dimension sizes for the tensor blocks. 
- DSizes block_sizes; - for (int i = 0; i < NumDims; ++i) { - block_sizes[i] = internal::random(1, tensor_slice_extents[i]); - } - - TensorSliceBlockMapper block_mapper(tensor_dims, tensor_slice_offsets, - tensor_slice_extents, block_sizes, - DimensionList()); - - for (int i = 0; i < block_mapper.total_block_count(); ++i) { - TensorBlock block = block_mapper.GetBlockForIndex(i, NULL); - UpdateCoeffSet(block, block.first_coeff_index(), - choose(Layout, NumDims - 1, 0), - &coeff_set); - } - - VERIFY_IS_EQUAL(Index(coeff_set.size()), total_coeffs); -} - -template -static void test_block_io_copy_data_from_source_to_target() { - typedef internal::TensorBlock TensorBlock; - typedef internal::TensorBlockMapper - TensorBlockMapper; - - typedef internal::TensorBlockReader - TensorBlockReader; - typedef internal::TensorBlockWriter - TensorBlockWriter; - - DSizes input_tensor_dims = RandomDims(); - const Index input_tensor_size = input_tensor_dims.TotalSize(); - - T* input_data = GenerateRandomData(input_tensor_size); - T* output_data = new T[input_tensor_size]; - - TensorBlockMapper block_mapper(input_tensor_dims, RandomShape(), - RandomTargetSize(input_tensor_dims)); - T* block_data = new T[block_mapper.block_dims_total_size()]; - - for (int i = 0; i < block_mapper.total_block_count(); ++i) { - TensorBlock block = block_mapper.GetBlockForIndex(i, block_data); - TensorBlockReader::Run(&block, input_data); - TensorBlockWriter::Run(block, output_data); - } - - for (int i = 0; i < input_tensor_size; ++i) { - VERIFY_IS_EQUAL(input_data[i], output_data[i]); - } - - delete[] input_data; - delete[] output_data; - delete[] block_data; -} - template static Index GetInputIndex(Index output_index, const array& output_to_input_dim_map, @@ -304,179 +201,6 @@ static array ComputeStrides( return strides; } -template -static void test_block_io_copy_using_reordered_dimensions() { - typedef internal::TensorBlock TensorBlock; - typedef internal::TensorBlockMapper - TensorBlockMapper; - - typedef internal::TensorBlockReader - TensorBlockReader; - typedef internal::TensorBlockWriter - TensorBlockWriter; - - DSizes input_tensor_dims = RandomDims(); - const Index input_tensor_size = input_tensor_dims.TotalSize(); - - // Create a random input tensor. - T* input_data = GenerateRandomData(input_tensor_size); - - // Create a random dimension re-ordering/shuffle. - std::vector shuffle; - for (int i = 0; i < NumDims; ++i) shuffle.push_back(i); - std::random_shuffle(shuffle.begin(), shuffle.end()); - - DSizes output_tensor_dims; - array input_to_output_dim_map; - array output_to_input_dim_map; - for (Index i = 0; i < NumDims; ++i) { - output_tensor_dims[shuffle[i]] = input_tensor_dims[i]; - input_to_output_dim_map[i] = shuffle[i]; - output_to_input_dim_map[shuffle[i]] = i; - } - - // Random block shape and size. 
- TensorBlockMapper block_mapper(output_tensor_dims, RandomShape(), - RandomTargetSize(input_tensor_dims)); - - T* block_data = new T[block_mapper.block_dims_total_size()]; - T* output_data = new T[input_tensor_size]; - - array input_tensor_strides = - ComputeStrides(input_tensor_dims); - array output_tensor_strides = - ComputeStrides(output_tensor_dims); - - for (Index i = 0; i < block_mapper.total_block_count(); ++i) { - TensorBlock block = block_mapper.GetBlockForIndex(i, block_data); - const Index first_coeff_index = GetInputIndex( - block.first_coeff_index(), output_to_input_dim_map, - input_tensor_strides, output_tensor_strides); - TensorBlockReader::Run(&block, first_coeff_index, input_to_output_dim_map, - input_tensor_strides, input_data); - TensorBlockWriter::Run(block, first_coeff_index, input_to_output_dim_map, - input_tensor_strides, output_data); - } - - for (int i = 0; i < input_tensor_size; ++i) { - VERIFY_IS_EQUAL(input_data[i], output_data[i]); - } - - delete[] input_data; - delete[] block_data; - delete[] output_data; -} - -// This is the special case for reading data with reordering, when dimensions -// before/after reordering are the same. Squeezing reads along inner dimensions -// in this case is illegal, because we reorder innermost dimension. -template -static void test_block_io_copy_using_reordered_dimensions_do_not_squeeze() -{ - typedef internal::TensorBlock TensorBlock; - typedef internal::TensorBlockReader - TensorBlockReader; - - DSizes tensor_dims; - tensor_dims[0] = 7; - tensor_dims[1] = 9; - tensor_dims[2] = 7; - - DSizes block_dims = tensor_dims; - - DSizes tensor_to_block_dim_map; - tensor_to_block_dim_map[0] = 2; - tensor_to_block_dim_map[1] = 1; - tensor_to_block_dim_map[2] = 0; - - DSizes tensor_strides(ComputeStrides(tensor_dims)); - DSizes block_strides(ComputeStrides(block_dims)); - - const Index tensor_size = tensor_dims.TotalSize(); - float* tensor_data = GenerateRandomData(tensor_size); - float* block_data = new float[tensor_size]; - - TensorBlock block(0, block_dims, block_strides, tensor_strides, block_data); - TensorBlockReader::Run(&block, - 0, - tensor_to_block_dim_map, - tensor_strides, - tensor_data); - - TensorMap > block_tensor(block_data, block_dims); - TensorMap > tensor_tensor(tensor_data, tensor_dims); - - for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) { - for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) { - for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) { - float block_value = block_tensor(d2, d1, d0); - float tensor_value = tensor_tensor(d0, d1, d2); - VERIFY_IS_EQUAL(block_value, tensor_value); - } - } - } - - delete[] block_data; - delete[] tensor_data; -} - -// This is the special case for reading data with reordering, when dimensions -// before/after reordering are the same. Squeezing reads in this case is allowed -// because we reorder outer dimensions. 
-template -static void test_block_io_copy_using_reordered_dimensions_squeeze() -{ - typedef internal::TensorBlock TensorBlock; - typedef internal::TensorBlockReader - TensorBlockReader; - - DSizes tensor_dims; - tensor_dims[0] = 7; - tensor_dims[1] = 5; - tensor_dims[2] = 9; - tensor_dims[3] = 9; - - DSizes block_dims = tensor_dims; - - DSizes tensor_to_block_dim_map; - tensor_to_block_dim_map[0] = 0; - tensor_to_block_dim_map[1] = 1; - tensor_to_block_dim_map[2] = 3; - tensor_to_block_dim_map[3] = 2; - - DSizes tensor_strides(ComputeStrides(tensor_dims)); - DSizes block_strides(ComputeStrides(block_dims)); - - const Index tensor_size = tensor_dims.TotalSize(); - float* tensor_data = GenerateRandomData(tensor_size); - float* block_data = new float[tensor_size]; - - TensorBlock block(0, block_dims, block_strides, tensor_strides, block_data); - TensorBlockReader::Run(&block, - 0, - tensor_to_block_dim_map, - tensor_strides, - tensor_data); - - TensorMap > block_tensor(block_data, block_dims); - TensorMap > tensor_tensor(tensor_data, tensor_dims); - - for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) { - for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) { - for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) { - for (Index d3 = 0; d3 < tensor_dims[3]; ++d3) { - float block_value = block_tensor(d0, d1, d3, d2); - float tensor_value = tensor_tensor(d0, d1, d2, d3); - VERIFY_IS_EQUAL(block_value, tensor_value); - } - } - } - } - - delete[] block_data; - delete[] tensor_data; -} - template class EqualityChecker { @@ -510,365 +234,6 @@ public: } }; -template -static void test_block_io_zero_stride() -{ - typedef internal::TensorBlock TensorBlock; - typedef internal::TensorBlockReader - TensorBlockReader; - typedef internal::TensorBlockWriter - TensorBlockWriter; - - DSizes rnd_dims = RandomDims<5>(); - - DSizes input_tensor_dims = rnd_dims; - input_tensor_dims[0] = 1; - input_tensor_dims[2] = 1; - input_tensor_dims[4] = 1; - const Index input_tensor_size = input_tensor_dims.TotalSize(); - float* input_data = GenerateRandomData(input_tensor_size); - - DSizes output_tensor_dims = rnd_dims; - - DSizes input_tensor_strides( - ComputeStrides(input_tensor_dims)); - DSizes output_tensor_strides( - ComputeStrides(output_tensor_dims)); - - DSizes input_tensor_strides_with_zeros(input_tensor_strides); - input_tensor_strides_with_zeros[0] = 0; - input_tensor_strides_with_zeros[2] = 0; - input_tensor_strides_with_zeros[4] = 0; - - // Verify that data was correctly read/written from/into the block. - const EqualityChecker verify_is_equal(input_data, input_tensor_dims, input_tensor_strides, output_tensor_dims, output_tensor_strides); - - { - float* output_data = new float[output_tensor_dims.TotalSize()]; - TensorBlock read_block(0, output_tensor_dims, output_tensor_strides, - input_tensor_strides_with_zeros, output_data); - TensorBlockReader::Run(&read_block, input_data); - verify_is_equal(output_data); - delete[] output_data; - } - - { - float* output_data = new float[output_tensor_dims.TotalSize()]; - TensorBlock write_block(0, output_tensor_dims, - input_tensor_strides_with_zeros, - output_tensor_strides, input_data); - TensorBlockWriter::Run(write_block, output_data); - verify_is_equal(output_data); - delete[] output_data; - } - - delete[] input_data; -} - -template -static void test_block_io_squeeze_ones() { - typedef internal::TensorBlock TensorBlock; - typedef internal::TensorBlockReader - TensorBlockReader; - typedef internal::TensorBlockWriter - TensorBlockWriter; - - // Total size > 1. 
- { - DSizes block_sizes(1, 2, 1, 2, 1); - const Index total_size = block_sizes.TotalSize(); - - // Create a random input tensor. - float* input_data = GenerateRandomData(total_size); - DSizes strides(ComputeStrides(block_sizes)); - - { - float* output_data = new float[block_sizes.TotalSize()]; - TensorBlock read_block(0, block_sizes, strides, strides, output_data); - TensorBlockReader::Run(&read_block, input_data); - for (int i = 0; i < total_size; ++i) { - VERIFY_IS_EQUAL(output_data[i], input_data[i]); - } - delete[] output_data; - } - - { - float* output_data = new float[block_sizes.TotalSize()]; - TensorBlock write_block(0, block_sizes, strides, strides, input_data); - TensorBlockWriter::Run(write_block, output_data); - for (int i = 0; i < total_size; ++i) { - VERIFY_IS_EQUAL(output_data[i], input_data[i]); - } - delete[] output_data; - } - } - - // Total size == 1. - { - DSizes block_sizes(1, 1, 1, 1, 1); - const Index total_size = block_sizes.TotalSize(); - - // Create a random input tensor. - float* input_data = GenerateRandomData(total_size); - DSizes strides(ComputeStrides(block_sizes)); - - { - float* output_data = new float[block_sizes.TotalSize()]; - TensorBlock read_block(0, block_sizes, strides, strides, output_data); - TensorBlockReader::Run(&read_block, input_data); - for (int i = 0; i < total_size; ++i) { - VERIFY_IS_EQUAL(output_data[i], input_data[i]); - } - delete[] output_data; - } - - { - float* output_data = new float[block_sizes.TotalSize()]; - TensorBlock write_block(0, block_sizes, strides, strides, input_data); - TensorBlockWriter::Run(write_block, output_data); - for (int i = 0; i < total_size; ++i) { - VERIFY_IS_EQUAL(output_data[i], input_data[i]); - } - delete[] output_data; - } - } -} - -template -static void test_block_cwise_unary_io_basic() { - typedef internal::scalar_square_op UnaryFunctor; - typedef internal::TensorBlockCwiseUnaryIO - TensorBlockCwiseUnaryIO; - - DSizes block_sizes = RandomDims(); - DSizes strides(ComputeStrides(block_sizes)); - - const Index total_size = block_sizes.TotalSize(); - - // Create a random input tensors. - T* input_data = GenerateRandomData(total_size); - - T* output_data = new T[total_size]; - UnaryFunctor functor; - TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data, - strides, input_data); - for (int i = 0; i < total_size; ++i) { - VERIFY_IS_EQUAL(output_data[i], functor(input_data[i])); - } - - delete[] input_data; - delete[] output_data; -} - -template -static void test_block_cwise_unary_io_squeeze_ones() { - typedef internal::scalar_square_op UnaryFunctor; - typedef internal::TensorBlockCwiseUnaryIO - TensorBlockCwiseUnaryIO; - - DSizes block_sizes(1, 2, 1, 3, 1); - DSizes strides(ComputeStrides(block_sizes)); - - const Index total_size = block_sizes.TotalSize(); - - // Create a random input tensors. 
- float* input_data = GenerateRandomData(total_size); - - float* output_data = new float[total_size]; - UnaryFunctor functor; - TensorBlockCwiseUnaryIO::Run(functor, block_sizes, strides, output_data, - strides, input_data); - for (int i = 0; i < total_size; ++i) { - VERIFY_IS_EQUAL(output_data[i], functor(input_data[i])); - } - - delete[] input_data; - delete[] output_data; -} - -template -static void test_block_cwise_unary_io_zero_strides() { - typedef internal::scalar_square_op UnaryFunctor; - typedef internal::TensorBlockCwiseUnaryIO - TensorBlockCwiseUnaryIO; - - DSizes rnd_dims = RandomDims<5>(); - - DSizes input_sizes = rnd_dims; - input_sizes[0] = 1; - input_sizes[2] = 1; - input_sizes[4] = 1; - - DSizes input_strides(ComputeStrides(input_sizes)); - input_strides[0] = 0; - input_strides[2] = 0; - input_strides[4] = 0; - - // Generate random data. - float* input_data = GenerateRandomData(input_sizes.TotalSize()); - - DSizes output_sizes = rnd_dims; - DSizes output_strides(ComputeStrides(output_sizes)); - - const Index output_total_size = output_sizes.TotalSize(); - float* output_data = new float[output_total_size]; - - UnaryFunctor functor; - TensorBlockCwiseUnaryIO::Run(functor, output_sizes, output_strides, - output_data, input_strides, input_data); - for (int i = 0; i < rnd_dims[0]; ++i) { - for (int j = 0; j < rnd_dims[1]; ++j) { - for (int k = 0; k < rnd_dims[2]; ++k) { - for (int l = 0; l < rnd_dims[3]; ++l) { - for (int m = 0; m < rnd_dims[4]; ++m) { - Index output_index = i * output_strides[0] + j * output_strides[1] + - k * output_strides[2] + l * output_strides[3] + - m * output_strides[4]; - Index input_index = i * input_strides[0] + j * input_strides[1] + - k * input_strides[2] + l * input_strides[3] + - m * input_strides[4]; - VERIFY_IS_EQUAL(output_data[output_index], - functor(input_data[input_index])); - } - } - } - } - } - - delete[] input_data; - delete[] output_data; -} - -template -static void test_block_cwise_binary_io_basic() { - typedef internal::scalar_sum_op BinaryFunctor; - typedef internal::TensorBlockCwiseBinaryIO - TensorBlockCwiseBinaryIO; - - DSizes block_sizes = RandomDims(); - DSizes strides(ComputeStrides(block_sizes)); - - const Index total_size = block_sizes.TotalSize(); - - // Create a random input tensors. - T* left_data = GenerateRandomData(total_size); - T* right_data = GenerateRandomData(total_size); - - T* output_data = new T[total_size]; - BinaryFunctor functor; - TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data, - strides, left_data, strides, right_data); - for (int i = 0; i < total_size; ++i) { - VERIFY_IS_EQUAL(output_data[i], functor(left_data[i], right_data[i])); - } - - delete[] left_data; - delete[] right_data; - delete[] output_data; -} - -template -static void test_block_cwise_binary_io_squeeze_ones() { - typedef internal::scalar_sum_op BinaryFunctor; - typedef internal::TensorBlockCwiseBinaryIO - TensorBlockCwiseBinaryIO; - - DSizes block_sizes(1, 2, 1, 3, 1); - DSizes strides(ComputeStrides(block_sizes)); - - const Index total_size = block_sizes.TotalSize(); - - // Create a random input tensors. 
- float* left_data = GenerateRandomData(total_size); - float* right_data = GenerateRandomData(total_size); - - float* output_data = new float[total_size]; - BinaryFunctor functor; - TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data, - strides, left_data, strides, right_data); - for (int i = 0; i < total_size; ++i) { - VERIFY_IS_EQUAL(output_data[i], functor(left_data[i], right_data[i])); - } - - delete[] left_data; - delete[] right_data; - delete[] output_data; -} - -template -static void test_block_cwise_binary_io_zero_strides() { - typedef internal::scalar_sum_op BinaryFunctor; - typedef internal::TensorBlockCwiseBinaryIO - TensorBlockCwiseBinaryIO; - - DSizes rnd_dims = RandomDims<5>(); - - DSizes left_sizes = rnd_dims; - left_sizes[0] = 1; - left_sizes[2] = 1; - left_sizes[4] = 1; - - DSizes left_strides(ComputeStrides(left_sizes)); - left_strides[0] = 0; - left_strides[2] = 0; - left_strides[4] = 0; - - DSizes right_sizes = rnd_dims; - right_sizes[1] = 1; - right_sizes[3] = 1; - - DSizes right_strides(ComputeStrides(right_sizes)); - right_strides[1] = 0; - right_strides[3] = 0; - - // Generate random data. - float* left_data = GenerateRandomData(left_sizes.TotalSize()); - float* right_data = GenerateRandomData(right_sizes.TotalSize()); - - DSizes output_sizes = rnd_dims; - DSizes output_strides(ComputeStrides(output_sizes)); - - const Index output_total_size = output_sizes.TotalSize(); - float* output_data = new float[output_total_size]; - - BinaryFunctor functor; - TensorBlockCwiseBinaryIO::Run(functor, output_sizes, output_strides, - output_data, left_strides, left_data, - right_strides, right_data); - for (int i = 0; i < rnd_dims[0]; ++i) { - for (int j = 0; j < rnd_dims[1]; ++j) { - for (int k = 0; k < rnd_dims[2]; ++k) { - for (int l = 0; l < rnd_dims[3]; ++l) { - for (int m = 0; m < rnd_dims[4]; ++m) { - Index output_index = i * output_strides[0] + j * output_strides[1] + - k * output_strides[2] + l * output_strides[3] + - m * output_strides[4]; - Index left_index = i * left_strides[0] + j * left_strides[1] + - k * left_strides[2] + l * left_strides[3] + - m * left_strides[4]; - Index right_index = i * right_strides[0] + j * right_strides[1] + - k * right_strides[2] + l * right_strides[3] + - m * right_strides[4]; - VERIFY_IS_EQUAL( - output_data[output_index], - functor(left_data[left_index], right_data[right_index])); - } - } - } - } - } - - delete[] left_data; - delete[] right_data; - delete[] output_data; -} - template static void test_uniform_block_shape() { @@ -1196,21 +561,6 @@ static void test_empty_dims(const internal::TensorBlockShapeType block_shape) EIGEN_DECLARE_TEST(cxx11_tensor_block_access) { TEST_LAYOUTS(test_block_mapper_sanity); TEST_LAYOUTS_AND_DIMS(float, test_block_mapper_maps_every_element); - TEST_LAYOUTS_AND_DIMS(float, test_slice_block_mapper_maps_every_element); - TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_data_from_source_to_target); - TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_data_from_source_to_target); - TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_using_reordered_dimensions); - TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_using_reordered_dimensions); - TEST_LAYOUTS(test_block_io_copy_using_reordered_dimensions_do_not_squeeze); - TEST_LAYOUTS(test_block_io_copy_using_reordered_dimensions_squeeze); - TEST_LAYOUTS(test_block_io_zero_stride); - TEST_LAYOUTS(test_block_io_squeeze_ones); - TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_unary_io_basic); - TEST_LAYOUTS(test_block_cwise_unary_io_squeeze_ones); - 
TEST_LAYOUTS(test_block_cwise_unary_io_zero_strides); - TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_binary_io_basic); - TEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones); - TEST_LAYOUTS(test_block_cwise_binary_io_zero_strides); TEST_LAYOUTS(test_uniform_block_shape); TEST_LAYOUTS(test_skewed_inner_dim_block_shape); TEST_LAYOUTS_WITH_ARG(test_empty_dims, internal::kUniformAllDims); diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index 0e70e1770..66b06e8ee 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -310,48 +310,6 @@ static void test_execute_shuffle_lvalue(Device d) } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims)); } -template -static void test_execute_reduction(Device d) -{ - static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); - - static constexpr int ReducedDims = NumDims - 2; - static constexpr int Options = 0 | Layout; - - auto dims = RandomDims(5, 10); - Tensor src(dims); - src.setRandom(); - - // Pick two random and unique reduction dimensions. - int reduction0 = internal::random(0, NumDims - 1); - int reduction1 = internal::random(0, NumDims - 1); - while (reduction0 == reduction1) { - reduction1 = internal::random(0, NumDims - 1); - } - - DSizes reduction_axis; - reduction_axis[0] = reduction0; - reduction_axis[1] = reduction1; - - Tensor golden = src.sum(reduction_axis); - - // Now do the reduction using configured tensor executor. - Tensor dst(golden.dimensions()); - - auto expr = src.sum(reduction_axis); - - using Assign = TensorAssignOp; - using Executor = - internal::TensorExecutor; - - Executor::run(Assign(dst, expr), d); - - for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { - VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); - } -} - template static void test_execute_reshape(Device d) @@ -663,57 +621,34 @@ static void test_async_execute_binary_expr(Device d) #define CALL_SUBTEST_PART(PART) \ CALL_SUBTEST_##PART -#define CALL_SUBTEST_COMBINATIONS_V1(PART, NAME, T, NUM_DIMS) \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))) - - // NOTE: Tiling V2 currently implemented for a limited types of expression, and only with default device. 
-#define CALL_SUBTEST_COMBINATIONS_V2(PART, NAME, T, NUM_DIMS) \ +#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \ CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME(default_device))); \ CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME(default_device))); \ CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME(default_device))); \ CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ - CALL_SUBTEST_PART(PART)((NAME(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME(default_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))) + CALL_SUBTEST_PART(PART)((NAME(tp_device))) // NOTE: Currently only ThreadPoolDevice supports async expression evaluation. #define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ - CALL_SUBTEST_PART(PART)((NAME(tp_device))) + CALL_SUBTEST_PART(PART)((NAME(tp_device))) EIGEN_DECLARE_TEST(cxx11_tensor_executor) { Eigen::DefaultDevice default_device; @@ -724,69 +659,64 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) { Eigen::ThreadPool tp(num_threads); Eigen::ThreadPoolDevice tp_device(&tp, num_threads); - CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 3); - CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 4); - CALL_SUBTEST_COMBINATIONS_V2(1, test_execute_unary_expr, float, 5); - - CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 3); - CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 4); - CALL_SUBTEST_COMBINATIONS_V2(2, test_execute_binary_expr, float, 5); - - CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 3); - CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 4); - CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 5); - - CALL_SUBTEST_COMBINATIONS_V2(4, test_execute_chipping_rvalue, float, 3); - CALL_SUBTEST_COMBINATIONS_V2(4, test_execute_chipping_rvalue, float, 4); - CALL_SUBTEST_COMBINATIONS_V2(4, test_execute_chipping_rvalue, float, 5); - - CALL_SUBTEST_COMBINATIONS_V2(5, test_execute_chipping_lvalue, float, 3); - CALL_SUBTEST_COMBINATIONS_V2(5, test_execute_chipping_lvalue, float, 4); - CALL_SUBTEST_COMBINATIONS_V2(5, 
test_execute_chipping_lvalue, float, 5); - - CALL_SUBTEST_COMBINATIONS_V2(6, test_execute_shuffle_rvalue, float, 3); - CALL_SUBTEST_COMBINATIONS_V2(6, test_execute_shuffle_rvalue, float, 4); - CALL_SUBTEST_COMBINATIONS_V2(6, test_execute_shuffle_rvalue, float, 5); - - CALL_SUBTEST_COMBINATIONS_V2(7, test_execute_shuffle_lvalue, float, 3); - CALL_SUBTEST_COMBINATIONS_V2(7, test_execute_shuffle_lvalue, float, 4); - CALL_SUBTEST_COMBINATIONS_V2(7, test_execute_shuffle_lvalue, float, 5); - - CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 2); - CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 3); - CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 4); - CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 5); - - CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 2); - CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 3); - CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 4); - CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 5); - - CALL_SUBTEST_COMBINATIONS_V2(10, test_execute_slice_rvalue, float, 2); - CALL_SUBTEST_COMBINATIONS_V2(10, test_execute_slice_rvalue, float, 3); - CALL_SUBTEST_COMBINATIONS_V2(10, test_execute_slice_rvalue, float, 4); - CALL_SUBTEST_COMBINATIONS_V2(10, test_execute_slice_rvalue, float, 5); - - CALL_SUBTEST_COMBINATIONS_V2(11, test_execute_slice_lvalue, float, 2); - CALL_SUBTEST_COMBINATIONS_V2(11, test_execute_slice_lvalue, float, 3); - CALL_SUBTEST_COMBINATIONS_V2(11, test_execute_slice_lvalue, float, 4); - CALL_SUBTEST_COMBINATIONS_V2(11, test_execute_slice_lvalue, float, 5); - - CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 2); - CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 3); - CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 4); - CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 5); - - CALL_SUBTEST_COMBINATIONS_V2(13, test_execute_generator_op, float, 2); - CALL_SUBTEST_COMBINATIONS_V2(13, test_execute_generator_op, float, 3); - CALL_SUBTEST_COMBINATIONS_V2(13, test_execute_generator_op, float, 4); - CALL_SUBTEST_COMBINATIONS_V2(13, test_execute_generator_op, float, 5); - - CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 1); - CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 2); - CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 3); - CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 4); - CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 5); + CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3); + CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 4); + CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 5); + + CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 3); + CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 4); + CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 5); + + CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 3); + CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 4); + CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 5); + + CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 
3); + CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 2); + CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 3); + CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 4); + CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 5); + + CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 2); + CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 2); + CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2); + CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3); + CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4); + CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5); + + CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2); + CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3); + CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4); + CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5); + + CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1); + CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2); + CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5); CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3); CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4); -- cgit v1.2.3