Diffstat (limited to 'unsupported')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h     |  38
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h   |   2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h |   7
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h    | 158
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h  |  22
-rw-r--r--  unsupported/test/cxx11_tensor_executor.cpp            |  43
6 files changed, 244 insertions(+), 26 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 576a4f3ec..60a07d6eb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -76,6 +76,8 @@ class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType, MakePointer_>,
   typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index;

+  static const int NumDims = Eigen::internal::traits<TensorEvalToOp>::NumDimensions;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr)
       : m_xpr(expr), m_buffer(buffer) {}

@@ -106,15 +108,22 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
   typedef StorageMemory<CoeffReturnType, Device> Storage;
   typedef typename Storage::Type EvaluatorPointerType;
   enum {
-    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
+    IsAligned         = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = true,
     PreferBlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = true
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = true
   };

+  typedef typename internal::TensorBlock<
+      CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
+      TensorBlock;
+
+  typedef typename internal::TensorBlockReader<
+      CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
+      TensorBlockReader;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){}

@@ -138,6 +147,18 @@
     internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    m_impl.getResourceRequirements(resources);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) {
+    TensorBlock eval_to_block(block->first_coeff_index(), block->block_sizes(),
+                              block->tensor_strides(), block->tensor_strides(),
+                              m_buffer + block->first_coeff_index());
+    m_impl.block(&eval_to_block);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }

@@ -153,6 +174,11 @@
     return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
+    assert(m_buffer != NULL);
+    TensorBlockReader::Run(block, m_buffer);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     // We assume that evalPacket or evalScalar is called to perform the
    // assignment and account for the cost of the write here.
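
The interesting part of the TensorEvalTo change is evalBlock(): rather than materializing a block into scratch memory and copying it out, the evaluator builds a destination block that aliases m_buffer at the block's first coefficient and carries the full tensor's strides, so the nested expression writes every coefficient straight to its final address. Below is a minimal standalone sketch of that idea; the Block struct and eval_into() producer are illustrative stand-ins, not Eigen's internals.

    // Illustrative sketch of writing a block in place; not Eigen's real API.
    #include <cstddef>
    #include <iostream>
    #include <vector>

    // The destination block: extents per dimension, strides of the *full*
    // output tensor, and a data pointer already offset to the block's origin.
    struct Block {
      std::vector<std::ptrdiff_t> sizes;
      std::vector<std::ptrdiff_t> strides;
      float* data;
    };

    // Stand-in for m_impl.block(...): fills a 2-d block through the strides,
    // so every coefficient lands at its final address in the output tensor.
    void eval_into(const Block& b) {
      for (std::ptrdiff_t j = 0; j < b.sizes[1]; ++j)
        for (std::ptrdiff_t i = 0; i < b.sizes[0]; ++i)
          b.data[i * b.strides[0] + j * b.strides[1]] = 1.0f;
    }

    int main() {
      // 4x4 column-major output; evaluate the 2x2 block whose origin is (2, 1).
      std::vector<float> buffer(16, 0.0f);
      const std::ptrdiff_t first_coeff_index = 2 + 1 * 4;  // linear index of (2, 1)
      Block block{{2, 2}, {1, 4}, buffer.data() + first_coeff_index};
      eval_into(block);  // no scratch buffer, no second copy
      for (int i = 0; i < 16; ++i)
        std::cout << buffer[i] << (i % 4 == 3 ? '\n' : ' ');
      return 0;
    }

Here the 2x2 block lands at linear indices 6, 7, 10, and 11 of the 4x4 buffer with no intermediate copy, which is exactly what passing m_buffer + first_coeff_index with the tensor strides accomplishes above.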
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 47e9b24ec..f1ae548f7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -346,7 +346,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
       // expressions.
       const int thread_idx = device.currentThreadId();
       eigen_assert(thread_idx >= -1 && thread_idx < num_threads);
-      Scalar* thread_buf = reinterpret_cast<Scalar*>(
+      ScalarNoConst* thread_buf = reinterpret_cast<ScalarNoConst*>(
          static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1));
       for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
         auto block = block_mapper.GetBlockForIndex(i, thread_buf);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index e7b7c1e6b..27a15bfb2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -133,7 +133,12 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
     typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
     EvalTo evalToTmp(m_device.get(m_buffer), m_op);
     const bool Vectorize = internal::IsVectorizable<Device, const ArgType>::value;
-    internal::TensorExecutor<const EvalTo, typename internal::remove_const<Device>::type, Vectorize>::run(evalToTmp, m_device);
+    const bool Tile = TensorEvaluator<const ArgType, Device>::BlockAccess &&
+                      TensorEvaluator<const ArgType, Device>::PreferBlockAccess;
+
+    internal::TensorExecutor<const EvalTo,
+                             typename internal::remove_const<Device>::type,
+                             Vectorize, Tile>::run(evalToTmp, m_device);
     return true;
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
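
In the TensorForcedEval change, the Tile flag is a compile-time constant derived from the argument evaluator's traits, so the choice between the tiled and the default executor is made by template specialization, not by a runtime branch. The following is a minimal sketch of that dispatch pattern; Executor and ReverseEvaluatorTraits are hypothetical stand-ins for Eigen's internal::TensorExecutor and TensorEvaluator traits.

    // Hypothetical stand-ins illustrating compile-time executor selection.
    #include <iostream>

    // Primary template: fall back to coefficient-at-a-time evaluation.
    template <bool Vectorize, bool Tile>
    struct Executor {
      static void run() { std::cout << "default executor: coefficient loop\n"; }
    };

    // Partial specialization selected when Tile is true at compile time.
    template <bool Vectorize>
    struct Executor<Vectorize, /*Tile=*/true> {
      static void run() { std::cout << "tiled executor: block-by-block\n"; }
    };

    // Stand-in for TensorEvaluator traits; reverse now sets both flags.
    struct ReverseEvaluatorTraits {
      static const bool BlockAccess = true;
      static const bool PreferBlockAccess = true;
    };

    int main() {
      const bool Vectorize = true;
      const bool Tile = ReverseEvaluatorTraits::BlockAccess &&
                        ReverseEvaluatorTraits::PreferBlockAccess;
      Executor<Vectorize, Tile>::run();  // picks the tiled specialization
      return 0;
    }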
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 3a699095c..64fed6de7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -113,18 +113,25 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
   typedef typename Storage::Type EvaluatorPointerType;

   enum {
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    PreferBlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = true,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false,
   };

+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
+      OutputTensorBlock;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
                                                         const Device& device)
-      : m_impl(op.expression(), device), m_reverse(op.reverse())
+      : m_impl(op.expression(), device),
+        m_reverse(op.reverse()),
+        m_device(device)
   {
     // Reversing a scalar isn't supported yet. It would be a no-op anyway.
     EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -142,6 +149,10 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
         m_strides[i] = m_strides[i+1] * m_dimensions[i+1];
       }
     }
+    // Remember the strides for fast division.
+    for (int i = 0; i < NumDims; ++i) {
+      m_fastStrides[i] = internal::TensorIntDivisor<Index>(m_strides[i]);
+    }
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -162,7 +173,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
-        Index idx = index / m_strides[i];
+        Index idx = index / m_fastStrides[i];
         index -= idx * m_strides[i];
         if (m_reverse[i]) {
           idx = m_dimensions[i] - idx - 1;
@@ -177,7 +188,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
     } else {
       EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
-        Index idx = index / m_strides[i];
+        Index idx = index / m_fastStrides[i];
         index -= idx * m_strides[i];
         if (m_reverse[i]) {
           idx = m_dimensions[i] - idx - 1;
@@ -217,6 +228,131 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
     return rslt;
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    Eigen::Index block_total_size_max = numext::maxi<Eigen::Index>(
+        1, m_device.lastLevelCacheSize() / sizeof(Scalar));
+    resources->push_back(internal::TensorOpResourceRequirements(
+        internal::kSkewedInnerDims, block_total_size_max));
+  }
+
+  struct BlockIteratorState {
+    Index block_size;
+    Index block_stride;
+    Index block_span;
+    Index input_size;
+    Index input_stride;
+    Index input_span;
+    Index count;
+    bool reverse;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+      OutputTensorBlock* output_block) const {
+    if (NumDims <= 0) return;
+
+    // TODO(ezhulenev): If the underlying tensor expression supports and
+    // prefers block evaluation we must use it. Currently we use coeff and
+    // packet access into the underlying tensor expression.
+    // static const bool useBlockAccessForArgType =
+    //     TensorEvaluator<ArgType, Device>::BlockAccess &&
+    //     TensorEvaluator<ArgType, Device>::PreferBlockAccess;
+
+    static const bool isColMajor =
+        static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+    static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
+    const bool inner_dim_reversed = m_reverse[inner_dim_idx];
+
+    CoeffReturnType* data = output_block->data();
+    Index block_offset = 0;
+
+    Index input_offset = reverseIndex(output_block->first_coeff_index());
+
+    // Initialize output block iterator state. Dimensions in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims> it;
+    for (Index i = 0; i < NumDims; ++i) {
+      const Index dim = isColMajor ? i : NumDims - 1 - i;
+      it[i].block_size = output_block->block_sizes()[dim];
+      it[i].block_stride = output_block->block_strides()[dim];
+      it[i].block_span = it[i].block_stride * (it[i].block_size - 1);
+      it[i].input_size = m_dimensions[dim];
+      it[i].input_stride = m_strides[dim];
+      it[i].input_span = it[i].input_stride * (it[i].input_size - 1);
+      it[i].count = 0;
+      it[i].reverse = m_reverse[dim];
+
+      if (it[i].reverse) {
+        it[i].input_stride = -1 * it[i].input_stride;
+        it[i].input_span = -1 * it[i].input_span;
+      }
+    }
+
+    // If multiple inner dimensions have the same reverse flag, check if we
+    // can merge them into a single virtual inner dimension.
+    int effective_inner_dim = 0;
+    for (int i = 1; i < NumDims; ++i) {
+      if (it[i].reverse != it[effective_inner_dim].reverse) break;
+      if (it[i].block_stride != it[effective_inner_dim].input_size) break;
+      if (it[i].block_stride != numext::abs(it[i].input_stride)) break;
+
+      it[i].block_size = it[effective_inner_dim].block_size * it[i].block_size;
+      it[i].input_size = it[effective_inner_dim].input_size * it[i].input_size;
+
+      it[i].block_stride = 1;
+      it[i].input_stride = (inner_dim_reversed ? -1 : 1);
+
+      it[i].block_span = it[i].block_stride * (it[i].block_size - 1);
+      it[i].input_span = it[i].input_stride * (it[i].input_size - 1);
+
+      effective_inner_dim = i;
+    }
+
+    eigen_assert(it[effective_inner_dim].block_stride == 1);
+    eigen_assert(it[effective_inner_dim].input_stride ==
+                 (inner_dim_reversed ? -1 : 1));
+
+    const Index inner_dim_size = it[effective_inner_dim].block_size;
+
+    while (it[NumDims - 1].count < it[NumDims - 1].block_size) {
+      // Copy inner-most dimension data from its (possibly reversed) location
+      // in the input.
+      Index dst = block_offset;
+      Index src = input_offset;
+
+      // NOTE(ezhulenev): Adding a vectorized path with internal::preverse
+      // showed worse results in benchmarks than a simple coefficient loop.
+      if (inner_dim_reversed) {
+        for (Index i = 0; i < inner_dim_size; ++i) {
+          data[dst] = m_impl.coeff(src);
+          ++dst;
+          --src;
+        }
+      } else {
+        for (Index i = 0; i < inner_dim_size; ++i) {
+          data[dst] = m_impl.coeff(src);
+          ++dst;
+          ++src;
+        }
+      }
+
+      // For a 1d tensor we need to generate only one inner-most dimension.
+      if ((NumDims - effective_inner_dim) == 1) break;
+
+      // Update offsets.
+      for (Index i = effective_inner_dim + 1; i < NumDims; ++i) {
+        if (++it[i].count < it[i].block_size) {
+          block_offset += it[i].block_stride;
+          input_offset += it[i].input_stride;
+          break;
+        }
+        if (i != NumDims - 1) it[i].count = 0;
+        block_offset -= it[i].block_span;
+        input_offset -= it[i].input_span;
+      }
+    }
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                      2 * TensorOpCost::MulCost<Index>() +
@@ -242,8 +378,10 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
 protected:
   Dimensions m_dimensions;
   array<Index, NumDims> m_strides;
+  array<internal::TensorIntDivisor<Index>, NumDims> m_fastStrides;
   TensorEvaluator<ArgType, Device> m_impl;
   ReverseDimensions m_reverse;
+  const Device& m_device;
 };

 // Eval as lvalue
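
The reverse evaluator's index arithmetic peels one coordinate per dimension off the linear index with a division by the cached stride, flips the coordinate when that dimension is reversed, and re-accumulates an input index; m_fastStrides only replaces the division with a cheaper multiply-and-shift. Below is a self-contained sketch of the mapping using plain division; reverse_index and the other names are illustrative, not Eigen API.

    // Illustrative sketch of the index-reversal math; not Eigen's API.
    #include <array>
    #include <cassert>
    #include <cstddef>

    const int kDims = 3;

    std::ptrdiff_t reverse_index(std::ptrdiff_t index,
                                 const std::array<std::ptrdiff_t, kDims>& dims,
                                 const std::array<std::ptrdiff_t, kDims>& strides,
                                 const std::array<bool, kDims>& reverse) {
      // Column-major: peel coordinates from the outermost dimension down.
      std::ptrdiff_t input_index = 0;
      for (int i = kDims - 1; i >= 0; --i) {
        std::ptrdiff_t coord = index / strides[i];  // Eigen uses TensorIntDivisor here
        index -= coord * strides[i];
        if (reverse[i]) coord = dims[i] - coord - 1;
        input_index += coord * strides[i];
      }
      return input_index;
    }

    int main() {
      // 2x3x4 column-major tensor, reversing only the middle dimension.
      const std::array<std::ptrdiff_t, kDims> dims = {{2, 3, 4}};
      const std::array<std::ptrdiff_t, kDims> strides = {{1, 2, 6}};
      const std::array<bool, kDims> reverse = {{false, true, false}};
      // Coordinate (0, 0, 0) must read from (0, 2, 0) = linear index 4.
      assert(reverse_index(0, dims, strides, reverse) == 4);
      // Coordinate (1, 2, 3) = linear 23 must read from (1, 0, 3) = linear 19.
      assert(reverse_index(23, dims, strides, reverse) == 19);
      return 0;
    }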
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index ae04785ce..ad6332179 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -273,6 +273,11 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
             input_block_strides[i + 1] * input_block_sizes[i + 1];
       }
     }
+    DSizes<internal::TensorIntDivisor<Index>, NumDims> fast_input_block_strides;
+    for (int i = 0; i < NumDims; ++i) {
+      fast_input_block_strides[i] =
+          internal::TensorIntDivisor<Index>(input_block_strides[i]);
+    }

     // Read input block.
     TensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
@@ -296,8 +301,9 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
         continue;
       }

-      Index output_index = GetBlockOutputIndex(input_index, input_block_strides,
-                                               output_block_strides);
+      Index output_index =
+          GetBlockOutputIndex(input_index, input_block_strides,
+                              output_block_strides, fast_input_block_strides);
       if (output_index == input_index) {
         // Coefficient already in place.
         bitmap[output_index] = true;
@@ -315,8 +321,9 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
         data[output_index] = shuffled_value;
         shuffled_value = evicted_value;
         bitmap[output_index] = true;
-        output_index = GetBlockOutputIndex(output_index, input_block_strides,
-                                           output_block_strides);
+        output_index =
+            GetBlockOutputIndex(output_index, input_block_strides,
+                                output_block_strides, fast_input_block_strides);
       } while (output_index != input_index);

       data[output_index] = shuffled_value;
@@ -345,11 +352,12 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
       Index input_index, const DSizes<Index, NumDims>& input_block_strides,
-      const DSizes<Index, NumDims>& output_block_strides) const {
+      const DSizes<Index, NumDims>& output_block_strides,
+      const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const {
     Index output_index = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = NumDims - 1; i > 0; --i) {
-        const Index idx = input_index / input_block_strides[i];
+        const Index idx = input_index / fast_input_block_strides[i];
         output_index += idx * output_block_strides[m_inverseShuffle[i]];
         input_index -= idx * input_block_strides[i];
       }
@@ -357,7 +365,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
       return output_index + input_index *
           output_block_strides[m_inverseShuffle[0]];
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
-        const Index idx = input_index / input_block_strides[i];
+        const Index idx = input_index / fast_input_block_strides[i];
         output_index += idx * output_block_strides[m_inverseShuffle[i]];
         input_index -= idx * input_block_strides[i];
       }
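
The shuffled-block path above rearranges coefficients in place by following permutation cycles: each unvisited slot starts a cycle, the current value is pushed forward while the occupant is evicted and carried on, and a bitmap marks slots that are done, so every element moves exactly once. Below is a minimal sketch of that cycle-following idea with an explicit destination map; permute_in_place and dest_of are illustrative names standing in for the GetBlockOutputIndex computation.

    // Illustrative sketch of in-place cycle-following permutation.
    #include <cassert>
    #include <cstddef>
    #include <utility>
    #include <vector>

    // dest_of(i) plays the role of GetBlockOutputIndex: where element i goes.
    void permute_in_place(std::vector<int>& data,
                          const std::vector<std::size_t>& dest_of) {
      std::vector<bool> done(data.size(), false);  // the bitmap
      for (std::size_t i = 0; i < data.size(); ++i) {
        if (done[i]) continue;
        std::size_t j = dest_of[i];
        int carried = data[i];
        done[i] = true;
        while (j != i) {                // follow the cycle starting at i
          std::swap(carried, data[j]);  // evict the occupant, drop the carried value
          done[j] = true;
          j = dest_of[j];
        }
        data[i] = carried;              // the cycle closes back at i
      }
    }

    int main() {
      std::vector<int> data = {10, 11, 12, 13};
      // Send 0->2, 1->1, 2->3, 3->0 (one fixed point, one 3-cycle).
      permute_in_place(data, {2, 1, 3, 0});
      assert(data[2] == 10 && data[1] == 11 && data[3] == 12 && data[0] == 13);
      return 0;
    }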
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
index 8ea03382d..e9922a48d 100644
--- a/unsupported/test/cxx11_tensor_executor.cpp
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -527,6 +527,41 @@ static void test_execute_generator_op(Device d)
   }
 }

+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          bool Tileable, int Layout>
+static void test_execute_reverse_rvalue(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(1, numext::pow(1000000.0, 1.0 / NumDims));
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  // Reverse half of the dimensions.
+  Eigen::array<bool, NumDims> reverse;
+  for (int i = 0; i < NumDims; ++i) reverse[i] = (dims[i] % 2 == 0);
+
+  const auto expr = src.reverse(reverse);
+
+  // We assume that reversing on a default device is tested and correct, so
+  // we can rely on it to verify correctness of tensor executor and tiling.
+  Tensor<T, NumDims, Options, Index> golden;
+  golden = expr;
+
+  // Now do the reversing using configured tensor executor.
+  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
 #ifdef EIGEN_DONT_VECTORIZE
 #define VECTORIZABLE(VAL) !EIGEN_DONT_VECTORIZE && VAL
 #else
@@ -619,8 +654,14 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
   CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4);
   CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5);

+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5);
+
   // Force CMake to split this test.
-  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14
 }

 #undef CALL_SUBTEST_COMBINATIONS
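
A closing note on internal::TensorIntDivisor, which both the TensorReverse and TensorShuffling changes introduce: when the divisor (a stride) is fixed for the lifetime of the evaluator, integer division can be strength-reduced to one multiply and one shift with a precomputed magic number. Below is a simplified sketch of the classic round-up multiplier scheme for non-negative 31-bit operands; FastDivisor illustrates the idea and is not Eigen's actual implementation.

    // Simplified magic-number divisor; not Eigen's TensorIntDivisor.
    #include <cassert>
    #include <cstdint>

    // Precomputes ceil(2^(31+s) / d) with s = ceil(log2(d)); by the classic
    // division-by-invariant-integers bound, (n * magic) >> (31 + s) then
    // equals n / d for all 0 <= n < 2^31.
    struct FastDivisor {
      std::uint64_t magic;
      int shift;

      explicit FastDivisor(std::uint32_t d) {
        assert(d >= 1 && d < (1u << 31));
        shift = 0;
        while ((std::uint32_t(1) << shift) < d) ++shift;  // ceil(log2(d))
        magic = ((std::uint64_t(1) << (31 + shift)) + d - 1) / d;
      }

      std::uint32_t divide(std::uint32_t n) const {  // exact for n < 2^31
        return static_cast<std::uint32_t>((n * magic) >> (31 + shift));
      }
    };

    int main() {
      // Self-check against ordinary division for a few stride-like divisors.
      const std::uint32_t divisors[] = {1, 2, 3, 7, 24, 1000, 40960};
      for (std::uint32_t d : divisors) {
        FastDivisor fast(d);
        for (std::uint32_t n = 0; n < 100000; ++n) {
          assert(fast.divide(n) == n / d);
        }
      }
      return 0;
    }

Since every index-to-coordinate step in coeff() and GetBlockOutputIndex() divides by a stride that never changes, caching these divisors (m_fastStrides, fast_input_block_strides) pays off across the many coefficients of a block.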