diff options
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 2 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h | 81 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 196 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_block_eval.cpp | 44 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_executor.cpp | 20 |
5 files changed, 303 insertions, 40 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index d1e4c82d2..8c44f1c4a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -131,7 +131,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device> ArgTensorBlock; typedef internal::TensorBlockAssignment< - Scalar, NumDims, typename ArgTensorBlock::XprType, Index> + CoeffReturnType, NumDims, typename ArgTensorBlock::XprType, Index> TensorBlockAssignment; //===--------------------------------------------------------------------===// diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 639e1dbb0..38d0bf7d3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -94,7 +94,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> IsAligned = false, PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), BlockAccess = true, - BlockAccessV2 = false, + BlockAccessV2 = true, PreferBlockAccess = true, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented @@ -107,7 +107,12 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> TensorBlock; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims, + Layout, Index> + TensorBlockV2; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -232,6 +237,78 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + static const bool is_col_major = + static_cast<int>(Layout) == static_cast<int>(ColMajor); + + // Compute spatial coordinates for the first block element. + array<Index, NumDims> coords; + extract_coordinates(desc.offset(), coords); + array<Index, NumDims> initial_coords = coords; + + // Try to reuse destination as an output block buffer. + CoeffReturnType* block_buffer = + desc.template destination<CoeffReturnType, Layout>(); + bool materialized_in_output; + + if (block_buffer != NULL) { + materialized_in_output = true; + + } else { + materialized_in_output = false; + void* mem = scratch.allocate(desc.size() * sizeof(CoeffReturnType)); + block_buffer = static_cast<CoeffReturnType*>(mem); + } + + // Offset in the output block buffer. + Index offset = 0; + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array<BlockIteratorState, NumDims> it; + for (int i = 0; i < NumDims; ++i) { + const int dim = is_col_major ? i : NumDims - 1 - i; + it[i].size = desc.dimension(dim); + it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride); + it[i].span = it[i].stride * (it[i].size - 1); + it[i].count = 0; + } + eigen_assert(it[0].stride == 1); + + while (it[NumDims - 1].count < it[NumDims - 1].size) { + // Generate data for the inner-most dimension. + for (Index i = 0; i < it[0].size; ++i) { + *(block_buffer + offset + i) = m_generator(coords); + coords[is_col_major ? 0 : NumDims - 1]++; + } + coords[is_col_major ? 0 : NumDims - 1] = + initial_coords[is_col_major ? 0 : NumDims - 1]; + + // For the 1d tensor we need to generate only one inner-most dimension. + if (NumDims == 1) break; + + // Update offset. + for (Index i = 1; i < NumDims; ++i) { + if (++it[i].count < it[i].size) { + offset += it[i].stride; + coords[is_col_major ? i : NumDims - 1 - i]++; + break; + } + if (i != NumDims - 1) it[i].count = 0; + coords[is_col_major ? i : NumDims - 1 - i] = + initial_coords[is_col_major ? i : NumDims - 1 - i]; + offset -= it[i].span; + } + } + + return TensorBlockV2( + materialized_in_output + ? internal::TensorBlockKind::kMaterializedInOutput + : internal::TensorBlockKind::kMaterializedInScratch, + block_buffer, desc.dimensions()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { // TODO(rmlarsen): This is just a placeholder. Define interface to make diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 855d04eb7..6e7abeb09 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -116,7 +116,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device IsAligned = false, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, BlockAccess = true, - BlockAccessV2 = false, + BlockAccessV2 = NumDims > 0, PreferBlockAccess = true, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented @@ -130,7 +130,15 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device OutputTensorBlock; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; + + typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2 + ArgTensorBlock; + + typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims, + Layout, Index> + TensorBlockV2; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, @@ -240,17 +248,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device internal::kSkewedInnerDims, block_total_size_max)); } - struct BlockIteratorState { - Index block_size; - Index block_stride; - Index block_span; - Index input_size; - Index input_stride; - Index input_span; - Index count; - bool reverse; - }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( OutputTensorBlock* output_block) const { if (NumDims <= 0) return; @@ -278,15 +275,16 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device array<BlockIteratorState, NumDims> it; for (Index i = 0; i < NumDims; ++i) { const Index dim = isColMajor ? i : NumDims - 1 - i; - it[i].block_size = output_block->block_sizes()[dim]; - it[i].block_stride = output_block->block_strides()[dim]; - it[i].block_span = it[i].block_stride * (it[i].block_size - 1); - it[i].input_size = m_dimensions[dim]; - it[i].input_stride = m_strides[dim]; - it[i].input_span = it[i].input_stride * (it[i].input_size - 1); + it[i].size = output_block->block_sizes()[dim]; it[i].count = 0; it[i].reverse = m_reverse[dim]; + it[i].block_stride = output_block->block_strides()[dim]; + it[i].block_span = it[i].block_stride * (it[i].size - 1); + + it[i].input_stride = m_strides[dim]; + it[i].input_span = it[i].input_stride * (it[i].size - 1); + if (it[i].reverse) { it[i].input_stride = -1 * it[i].input_stride; it[i].input_span = -1 * it[i].input_span; @@ -298,17 +296,16 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device int effective_inner_dim = 0; for (int i = 1; i < NumDims; ++i) { if (it[i].reverse != it[effective_inner_dim].reverse) break; - if (it[i].block_stride != it[effective_inner_dim].input_size) break; + if (it[i].block_stride != it[effective_inner_dim].size) break; if (it[i].block_stride != numext::abs(it[i].input_stride)) break; - it[i].block_size = it[effective_inner_dim].block_size * it[i].block_size; - it[i].input_size = it[effective_inner_dim].input_size * it[i].input_size; + it[i].size = it[effective_inner_dim].size * it[i].size; it[i].block_stride = 1; it[i].input_stride = (inner_dim_reversed ? -1 : 1); - it[i].block_span = it[i].block_stride * (it[i].block_size - 1); - it[i].input_span = it[i].input_stride * (it[i].input_size - 1); + it[i].block_span = it[i].block_stride * (it[i].size - 1); + it[i].input_span = it[i].input_stride * (it[i].size - 1); effective_inner_dim = i; } @@ -317,9 +314,9 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device eigen_assert(it[effective_inner_dim].input_stride == (inner_dim_reversed ? -1 : 1)); - const Index inner_dim_size = it[effective_inner_dim].block_size; + const Index inner_dim_size = it[effective_inner_dim].size; - while (it[NumDims - 1].count < it[NumDims - 1].block_size) { + while (it[NumDims - 1].count < it[NumDims - 1].size) { // Copy inner-most dimension data from reversed location in input. Index dst = block_offset; Index src = input_offset; @@ -345,7 +342,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device // Update offset. for (Index i = effective_inner_dim + 1; i < NumDims; ++i) { - if (++it[i].count < it[i].block_size) { + if (++it[i].count < it[i].size) { block_offset += it[i].block_stride; input_offset += it[i].input_stride; break; @@ -357,6 +354,131 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + // TODO(ezhulenev): If underlying tensor expression supports and prefers + // block evaluation we must use it. Currently we use coeff and packet + // access into the underlying tensor expression. + // static const bool useBlockAccessForArgType = + // TensorEvaluator<ArgType, Device>::BlockAccess && + // TensorEvaluator<ArgType, Device>::PreferBlockAccess; + + static const bool isColMajor = + static_cast<int>(Layout) == static_cast<int>(ColMajor); + + static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1; + const bool inner_dim_reversed = m_reverse[inner_dim_idx]; + + // Try to reuse destination as an output block buffer. + CoeffReturnType* block_buffer = desc.template destination<CoeffReturnType, Layout>(); + bool materialized_in_output; + + if (block_buffer != NULL) { + materialized_in_output = true; + + } else { + materialized_in_output = false; + void* mem = scratch.allocate(desc.size() * sizeof(CoeffReturnType)); + block_buffer = static_cast<CoeffReturnType*>(mem); + } + + // Offset in the output block. + Index block_offset = 0; + + // Offset in the input Tensor. + Index input_offset = reverseIndex(desc.offset()); + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array<BlockIteratorState, NumDims> it; + for (int i = 0; i < NumDims; ++i) { + const int dim = isColMajor ? i : NumDims - 1 - i; + it[i].size = desc.dimension(dim); + it[i].count = 0; + it[i].reverse = m_reverse[dim]; + + it[i].block_stride = + i == 0 ? 1 : (it[i - 1].size * it[i - 1].block_stride); + it[i].block_span = it[i].block_stride * (it[i].size - 1); + + it[i].input_stride = m_strides[dim]; + it[i].input_span = it[i].input_stride * (it[i].size - 1); + + if (it[i].reverse) { + it[i].input_stride = -1 * it[i].input_stride; + it[i].input_span = -1 * it[i].input_span; + } + } + + // If multiple inner dimensions have the same reverse flag, check if we can + // merge them into a single virtual inner dimension. + int effective_inner_dim = 0; + for (int i = 1; i < NumDims; ++i) { + if (it[i].reverse != it[effective_inner_dim].reverse) break; + if (it[i].block_stride != it[effective_inner_dim].size) break; + if (it[i].block_stride != numext::abs(it[i].input_stride)) break; + + it[i].size = it[effective_inner_dim].size * it[i].size; + + it[i].block_stride = 1; + it[i].input_stride = (inner_dim_reversed ? -1 : 1); + + it[i].block_span = it[i].block_stride * (it[i].size - 1); + it[i].input_span = it[i].input_stride * (it[i].size - 1); + + effective_inner_dim = i; + } + + eigen_assert(it[effective_inner_dim].block_stride == 1); + eigen_assert(it[effective_inner_dim].input_stride == + (inner_dim_reversed ? -1 : 1)); + + const Index inner_dim_size = it[effective_inner_dim].size; + + while (it[NumDims - 1].count < it[NumDims - 1].size) { + // Copy inner-most dimension data from reversed location in input. + Index dst = block_offset; + Index src = input_offset; + + // NOTE(ezhulenev): Adding vectorized path with internal::preverse showed + // worse results in benchmarks than a simple coefficient loop. + if (inner_dim_reversed) { + for (Index i = 0; i < inner_dim_size; ++i) { + block_buffer[dst] = m_impl.coeff(src); + ++dst; + --src; + } + } else { + for (Index i = 0; i < inner_dim_size; ++i) { + block_buffer[dst] = m_impl.coeff(src); + ++dst; + ++src; + } + } + + // For the 1d tensor we need to generate only one inner-most dimension. + if ((NumDims - effective_inner_dim) == 1) break; + + // Update offset. + for (Index i = effective_inner_dim + 1; i < NumDims; ++i) { + if (++it[i].count < it[i].size) { + block_offset += it[i].block_stride; + input_offset += it[i].input_stride; + break; + } + if (i != NumDims - 1) it[i].count = 0; + block_offset -= it[i].block_span; + input_offset -= it[i].input_span; + } + } + + return TensorBlockV2( + materialized_in_output + ? internal::TensorBlockKind::kMaterializedInOutput + : internal::TensorBlockKind::kMaterializedInScratch, + block_buffer, desc.dimensions()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + @@ -386,6 +508,26 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device TensorEvaluator<ArgType, Device> m_impl; ReverseDimensions m_reverse; const Device EIGEN_DEVICE_REF m_device; + + private: + struct BlockIteratorState { + BlockIteratorState() + : size(0), + count(0), + reverse(false), + block_stride(0), + block_span(0), + input_stride(0), + input_span(0) {} + + Index size; + Index count; + bool reverse; + Index block_stride; + Index block_span; + Index input_stride; + Index input_span; + }; }; // Eval as lvalue diff --git a/unsupported/test/cxx11_tensor_block_eval.cpp b/unsupported/test/cxx11_tensor_block_eval.cpp index e11092af3..aac75014c 100644 --- a/unsupported/test/cxx11_tensor_block_eval.cpp +++ b/unsupported/test/cxx11_tensor_block_eval.cpp @@ -369,6 +369,48 @@ static void test_eval_tensor_chipping() { [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); }); } +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_generator() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + auto generator = [](const array<Index, NumDims>& dims) -> T { + T result = static_cast<T>(0); + for (int i = 0; i < NumDims; ++i) { + result += static_cast<T>((i + 1) * dims[i]); + } + return result; + }; + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.generate(generator), + [&dims]() { return FixedSizeBlock(dims); }); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.generate(generator), + [&dims]() { return RandomBlock<Layout>(dims, 1, 10); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_reverse() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + // Randomly reverse dimensions. + Eigen::DSizes<bool, NumDims> reverse; + for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random<bool>(); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.reverse(reverse), + [&dims]() { return FixedSizeBlock(dims); }); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.reverse(reverse), + [&dims]() { return RandomBlock<Layout>(dims, 1, 10); }); +} + template <typename T, int Layout> static void test_eval_tensor_reshape_with_bcast() { Index dim = internal::random<Index>(1, 100); @@ -573,6 +615,8 @@ EIGEN_DECLARE_TEST(cxx11_tensor_block_eval) { CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_select); CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_padding); CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_chipping); + CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_generator); + CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_reverse); CALL_SUBTESTS_LAYOUTS(test_eval_tensor_reshape_with_bcast); CALL_SUBTESTS_LAYOUTS(test_eval_tensor_forced_eval); diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index 8fb4ba752..66f932746 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -539,7 +539,7 @@ static void test_execute_reverse_rvalue(Device d) // Reverse half of the dimensions. Eigen::array<bool, NumDims> reverse; - for (int i = 0; i < NumDims; ++i) reverse[i] = (dims[i] % 2 == 0); + for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random<bool>(); const auto expr = src.reverse(reverse); @@ -756,16 +756,16 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) { CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 4); CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 5); - CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 2); - CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 3); - CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 4); - CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 5); + CALL_SUBTEST_COMBINATIONS_V2(13, test_execute_generator_op, float, 2); + CALL_SUBTEST_COMBINATIONS_V2(13, test_execute_generator_op, float, 3); + CALL_SUBTEST_COMBINATIONS_V2(13, test_execute_generator_op, float, 4); + CALL_SUBTEST_COMBINATIONS_V2(13, test_execute_generator_op, float, 5); - CALL_SUBTEST_COMBINATIONS_V1(14, test_execute_reverse_rvalue, float, 1); - CALL_SUBTEST_COMBINATIONS_V1(14, test_execute_reverse_rvalue, float, 2); - CALL_SUBTEST_COMBINATIONS_V1(14, test_execute_reverse_rvalue, float, 3); - CALL_SUBTEST_COMBINATIONS_V1(14, test_execute_reverse_rvalue, float, 4); - CALL_SUBTEST_COMBINATIONS_V1(14, test_execute_reverse_rvalue, float, 5); + CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 1); + CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 2); + CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 3); + CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 4); + CALL_SUBTEST_COMBINATIONS_V2(14, test_execute_reverse_rvalue, float, 5); CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3); CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4); |