-rw-r--r-- | Eigen/src/Core/AssignEvaluator.h | 2
-rw-r--r-- | Eigen/src/Core/ProductEvaluators.h | 20
-rw-r--r-- | Eigen/src/Core/Redux.h | 18
-rw-r--r-- | test/unalignedcount.cpp | 9
-rw-r--r-- | test/vectorization_logic.cpp | 55
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h | 6
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 12
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 48
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 40
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h | 4
-rw-r--r-- | unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h | 8
-rw-r--r-- | unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h | 14
12 files changed, 143 insertions, 93 deletions
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 362d905d2..83cec500f 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -172,6 +172,8 @@ public: EIGEN_DEBUG_VAR(MaySliceVectorize) std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl; EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost) + EIGEN_DEBUG_VAR(DstEvaluator::CoeffReadCost) + EIGEN_DEBUG_VAR(Dst::SizeAtCompileTime) EIGEN_DEBUG_VAR(UnrollingLimit) EIGEN_DEBUG_VAR(MayUnrollCompletely) EIGEN_DEBUG_VAR(MayUnrollInner) diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 2787987e7..0762d9e8b 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -803,13 +803,21 @@ public: MatrixFlags = evaluator<MatrixType>::Flags, DiagFlags = evaluator<DiagonalType>::Flags, - _StorageOrder = MatrixFlags & RowMajorBit ? RowMajor : ColMajor, + + _StorageOrder = (Derived::MaxRowsAtCompileTime==1 && Derived::MaxColsAtCompileTime!=1) ? RowMajor + : (Derived::MaxColsAtCompileTime==1 && Derived::MaxRowsAtCompileTime!=1) ? ColMajor + : MatrixFlags & RowMajorBit ? RowMajor : ColMajor, + _SameStorageOrder = _StorageOrder == (MatrixFlags & RowMajorBit ? RowMajor : ColMajor), + _ScalarAccessOnDiag = !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft) ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)), _SameTypes = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value, // FIXME currently we need same types, but in the future the next rule should be the one //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))), - _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))), + _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) + && _SameTypes + && (_SameStorageOrder || (MatrixFlags&LinearAccessBit)==LinearAccessBit) + && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))), _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0, Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0), Alignment = evaluator<MatrixType>::Alignment, @@ -870,10 +878,10 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha typedef Product<Lhs, Rhs, ProductKind> XprType; typedef typename XprType::PlainObject PlainObject; + typedef typename Lhs::DiagonalVectorType DiagonalType; + - enum { - StorageOrder = int(Rhs::Flags) & RowMajorBit ? RowMajor : ColMajor - }; + enum { StorageOrder = Base::_StorageOrder }; EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) @@ -917,7 +925,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape, typedef Product<Lhs, Rhs, ProductKind> XprType; typedef typename XprType::PlainObject PlainObject; - enum { StorageOrder = int(Lhs::Flags) & RowMajorBit ? 
RowMajor : ColMajor }; + enum { StorageOrder = Base::_StorageOrder }; EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal()) diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index e449ef3ac..0aee855df 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -32,14 +32,20 @@ public: PacketSize = unpacket_traits<PacketType>::size, InnerMaxSize = int(Evaluator::IsRowMajor) ? Evaluator::MaxColsAtCompileTime - : Evaluator::MaxRowsAtCompileTime + : Evaluator::MaxRowsAtCompileTime, + OuterMaxSize = int(Evaluator::IsRowMajor) + ? Evaluator::MaxRowsAtCompileTime + : Evaluator::MaxColsAtCompileTime, + SliceVectorizedWork = int(InnerMaxSize)==Dynamic ? Dynamic + : int(OuterMaxSize)==Dynamic ? (int(InnerMaxSize)>=int(PacketSize) ? Dynamic : 0) + : (int(InnerMaxSize)/int(PacketSize)) * int(OuterMaxSize) }; enum { MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit) && (functor_traits<Func>::PacketAccess), MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit), - MaySliceVectorize = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize + MaySliceVectorize = bool(MightVectorize) && (int(SliceVectorizedWork)==Dynamic || int(SliceVectorizedWork)>=3) }; public: @@ -69,13 +75,15 @@ public: EIGEN_DEBUG_VAR(Evaluator::Flags) std::cerr.unsetf(std::ios::hex); EIGEN_DEBUG_VAR(InnerMaxSize) + EIGEN_DEBUG_VAR(OuterMaxSize) + EIGEN_DEBUG_VAR(SliceVectorizedWork) EIGEN_DEBUG_VAR(PacketSize) EIGEN_DEBUG_VAR(MightVectorize) EIGEN_DEBUG_VAR(MayLinearVectorize) EIGEN_DEBUG_VAR(MaySliceVectorize) - EIGEN_DEBUG_VAR(Traversal) + std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl; EIGEN_DEBUG_VAR(UnrollingLimit) - EIGEN_DEBUG_VAR(Unrolling) + std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl; std::cerr << std::endl; } #endif @@ -402,7 +410,7 @@ DenseBase<Derived>::redux(const Func& func) const typedef typename internal::redux_evaluator<Derived> ThisEvaluator; ThisEvaluator thisEval(derived()); - + // The initial expression is passed to the reducer as an additional argument instead of // passing it as a member of redux_evaluator to help return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived()); diff --git a/test/unalignedcount.cpp b/test/unalignedcount.cpp index 069fc1bb9..52cdd9e1d 100644 --- a/test/unalignedcount.cpp +++ b/test/unalignedcount.cpp @@ -30,7 +30,14 @@ static int nb_storeu; EIGEN_DECLARE_TEST(unalignedcount) { - #if defined(EIGEN_VECTORIZE_AVX) + #if defined(EIGEN_VECTORIZE_AVX512) + VectorXf a(48), b(48); + VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 6, 0, 3, 0); + VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,48) += b.segment(0,48), 3, 3, 3, 0); + VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,48) -= b.segment(0,48), 3, 3, 3, 0); + VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,48) *= 3.5, 3, 0, 3, 0); + VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,48) /= 3.5, 3, 0, 3, 0); + #elif defined(EIGEN_VECTORIZE_AVX) VectorXf a(40), b(40); VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 10, 0, 5, 0); VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) += b.segment(0,40), 5, 5, 5, 0); diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index c15f75103..e2146eef3 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -37,6 +37,7 @@ using internal::demangle_unrolling; template<typename Dst, typename Src> bool test_assign(const Dst&, const Src&, int 
traversal, int unrolling) { + EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src); typedef internal::copy_using_evaluator_traits<internal::evaluator<Dst>,internal::evaluator<Src>, internal::assign_op<typename Dst::Scalar,typename Src::Scalar> > traits; bool res = traits::Traversal==traversal; if(unrolling==InnerUnrolling+CompleteUnrolling) @@ -61,6 +62,7 @@ bool test_assign(const Dst&, const Src&, int traversal, int unrolling) template<typename Dst, typename Src> bool test_assign(int traversal, int unrolling) { + EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src); typedef internal::copy_using_evaluator_traits<internal::evaluator<Dst>,internal::evaluator<Src>, internal::assign_op<typename Dst::Scalar,typename Src::Scalar> > traits; bool res = traits::Traversal==traversal && traits::Unrolling==unrolling; if(!res) @@ -117,26 +119,26 @@ struct vectorization_logic typedef Matrix<Scalar,Dynamic,1> VectorX; typedef Matrix<Scalar,Dynamic,Dynamic> MatrixXX; typedef Matrix<Scalar,PacketSize,PacketSize> Matrix11; - typedef Matrix<Scalar,2*PacketSize,2*PacketSize> Matrix22; + typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?8:2*PacketSize,(Matrix11::Flags&RowMajorBit)?2*PacketSize:8> Matrix22; typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?16:4*PacketSize,(Matrix11::Flags&RowMajorBit)?4*PacketSize:16> Matrix44; typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?16:4*PacketSize,(Matrix11::Flags&RowMajorBit)?4*PacketSize:16,DontAlign|EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION> Matrix44u; typedef Matrix<Scalar,4*PacketSize,4*PacketSize,ColMajor> Matrix44c; typedef Matrix<Scalar,4*PacketSize,4*PacketSize,RowMajor> Matrix44r; typedef Matrix<Scalar, - (PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1), - (PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1) + (PacketSize==16 ? 8 : PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1), + (PacketSize==16 ? 2 : PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1) > Matrix1; typedef Matrix<Scalar, - (PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1), - (PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1), + (PacketSize==16 ? 8 : PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1), + (PacketSize==16 ? 2 : PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1), DontAlign|((Matrix1::Flags&RowMajorBit)?RowMajor:ColMajor)> Matrix1u; // this type is made such that it can only be vectorized when viewed as a linear 1D vector typedef Matrix<Scalar, - (PacketSize==8 ? 4 : PacketSize==4 ? 6 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?2:3) : /*PacketSize==1 ?*/ 1), - (PacketSize==8 ? 6 : PacketSize==4 ? 2 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?3:2) : /*PacketSize==1 ?*/ 3) + (PacketSize==16 ? 4 : PacketSize==8 ? 4 : PacketSize==4 ? 6 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?2:3) : /*PacketSize==1 ?*/ 1), + (PacketSize==16 ? 12 : PacketSize==8 ? 6 : PacketSize==4 ? 2 : PacketSize==2 ? 
((Matrix11::Flags&RowMajorBit)?3:2) : /*PacketSize==1 ?*/ 3) > Matrix3; #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT @@ -202,7 +204,7 @@ struct vectorization_logic VERIFY(test_assign(Matrix11(), Matrix11()+Matrix11(),InnerVectorizedTraversal,CompleteUnrolling)); - VERIFY(test_assign(Matrix11(),Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(8,4), + VERIFY(test_assign(Matrix11(),Matrix<Scalar,21,21>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,21,21>().template block<PacketSize,PacketSize>(3,2), (EIGEN_UNALIGNED_VECTORIZE) ? InnerVectorizedTraversal : DefaultTraversal, CompleteUnrolling|InnerUnrolling)); VERIFY(test_assign(Vector1(),Matrix11()*Vector1(), @@ -230,8 +232,13 @@ struct vectorization_logic VERIFY(test_redux(Matrix44(), LinearVectorizedTraversal,NoUnrolling)); - VERIFY(test_redux(Matrix44().template block<(Matrix1::Flags&RowMajorBit)?4:PacketSize,(Matrix1::Flags&RowMajorBit)?PacketSize:4>(1,2), - DefaultTraversal,CompleteUnrolling)); + if(PacketSize>1) { + VERIFY(test_redux(Matrix44().template block<(Matrix1::Flags&RowMajorBit)?4:PacketSize,(Matrix1::Flags&RowMajorBit)?PacketSize:4>(1,2), + SliceVectorizedTraversal,CompleteUnrolling)); + + VERIFY(test_redux(Matrix44().template block<(Matrix1::Flags&RowMajorBit)?2:PacketSize,(Matrix1::Flags&RowMajorBit)?PacketSize:2>(1,2), + DefaultTraversal,CompleteUnrolling)); + } VERIFY(test_redux(Matrix44c().template block<2*PacketSize,1>(1,2), LinearVectorizedTraversal,CompleteUnrolling)); @@ -289,19 +296,19 @@ struct vectorization_logic_half // typedef Matrix<Scalar,4*PacketSize,4*PacketSize,RowMajor> Matrix44r; typedef Matrix<Scalar, - (PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1), - (PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1) + (PacketSize==16 ? 8 : PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1), + (PacketSize==16 ? 2 : PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1) > Matrix1; typedef Matrix<Scalar, - (PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1), - (PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1), + (PacketSize==16 ? 8 : PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1), + (PacketSize==16 ? 2 : PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1), DontAlign|((Matrix1::Flags&RowMajorBit)?RowMajor:ColMajor)> Matrix1u; // this type is made such that it can only be vectorized when viewed as a linear 1D vector typedef Matrix<Scalar, - (PacketSize==8 ? 4 : PacketSize==4 ? 6 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?2:3) : /*PacketSize==1 ?*/ 1), - (PacketSize==8 ? 6 : PacketSize==4 ? 2 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?3:2) : /*PacketSize==1 ?*/ 3) + (PacketSize==16 ? 4 : PacketSize==8 ? 4 : PacketSize==4 ? 6 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?2:3) : /*PacketSize==1 ?*/ 1), + (PacketSize==16 ? 12 : PacketSize==8 ? 6 : PacketSize==4 ? 2 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?3:2) : /*PacketSize==1 ?*/ 3) > Matrix3; #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT @@ -354,7 +361,8 @@ struct vectorization_logic_half NoUnrolling)); VERIFY(test_assign(Matrix11(),Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(8,4), - EIGEN_UNALIGNED_VECTORIZE ? 
InnerVectorizedTraversal : DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling)); + EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : DefaultTraversal,InnerUnrolling+CompleteUnrolling)); + VERIFY(test_assign(Vector1(),Matrix11()*Vector1(), InnerVectorizedTraversal,CompleteUnrolling)); @@ -375,16 +383,21 @@ struct vectorization_logic_half VERIFY(test_redux(Matrix35(), LinearVectorizedTraversal,CompleteUnrolling)); - VERIFY(test_redux(Matrix57().template block<PacketSize,3>(1,0), - DefaultTraversal,CompleteUnrolling)); + VERIFY(test_redux(Matrix57().template block<PacketSize==1?2:PacketSize,3>(1,0), + SliceVectorizedTraversal,CompleteUnrolling)); + + if(PacketSize>1) { + VERIFY(test_redux(Matrix57().template block<PacketSize,2>(1,0), + DefaultTraversal,CompleteUnrolling)); + } VERIFY((test_assign< Map<Matrix<Scalar,EIGEN_PLAIN_ENUM_MAX(2,PacketSize),EIGEN_PLAIN_ENUM_MAX(2,PacketSize)>, AlignedMax, InnerStride<3*PacketSize> >, Matrix<Scalar,EIGEN_PLAIN_ENUM_MAX(2,PacketSize),EIGEN_PLAIN_ENUM_MAX(2,PacketSize)> - >(DefaultTraversal,CompleteUnrolling))); + >(DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling))); VERIFY((test_assign(Matrix57(), Matrix<Scalar,5*PacketSize,3>()*Matrix<Scalar,3,7>(), - InnerVectorizedTraversal, InnerUnrolling|CompleteUnrolling))); + InnerVectorizedTraversal, InnerUnrolling+CompleteUnrolling))); #endif } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index dbb0f76bb..2d3b69128 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -255,7 +255,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1); const Index first = indexPair.first; - const Index last = indexPair.second; + const Index lastIdx = indexPair.second; // We can always do optimized packet reads from left hand side right now, because // the vertical matrix dimension on the left hand side is never contracting. @@ -263,7 +263,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, // been shuffled first. if (Tensor::PacketAccess && (side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) && - (last - first) == (packet_size - 1)) { + (lastIdx - first) == (packet_size - 1)) { return this->m_tensor.template packet<AlignmentType>(first); } @@ -276,7 +276,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, data[k] = this->m_tensor.coeff(internal_pair.first); data[k + 1] = this->m_tensor.coeff(internal_pair.second); } - data[packet_size - 1] = this->m_tensor.coeff(last); + data[packet_size - 1] = this->m_tensor.coeff(lastIdx); return pload<PacketT>(data); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index 6fc6688d3..1612c004b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -213,17 +213,17 @@ struct ThreadPoolDevice { // block_count leaves that do actual computations. 
Barrier barrier(static_cast<unsigned int>(block_count)); std::function<void(Index, Index)> handleRange; - handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) { - if (last - first <= block_size) { + handleRange = [=, &handleRange, &barrier, &f](Index firstIdx, Index lastIdx) { + if (lastIdx - firstIdx <= block_size) { // Single block or less, execute directly. - f(first, last); + f(firstIdx, lastIdx); barrier.Notify(); return; } // Split into halves and submit to the pool. - Index mid = first + divup((last - first) / 2, block_size) * block_size; - pool_->Schedule([=, &handleRange]() { handleRange(mid, last); }); - handleRange(first, mid); + Index mid = firstIdx + divup((lastIdx - firstIdx) / 2, block_size) * block_size; + pool_->Schedule([=, &handleRange]() { handleRange(mid, lastIdx); }); + handleRange(firstIdx, mid); }; handleRange(0, n); barrier.Wait(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index bfe1f97b8..1c44541bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -165,11 +165,11 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable, #ifdef EIGEN_USE_THREADS template <typename Evaluator, typename StorageIndex, bool Vectorizable> struct EvalRange { - static void run(Evaluator* evaluator_in, const StorageIndex first, - const StorageIndex last) { + static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, + const StorageIndex lastIdx) { Evaluator evaluator = *evaluator_in; - eigen_assert(last >= first); - for (StorageIndex i = first; i < last; ++i) { + eigen_assert(lastIdx >= firstIdx); + for (StorageIndex i = firstIdx; i < lastIdx; ++i) { evaluator.evalScalar(i); } } @@ -182,14 +182,14 @@ struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> { static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; - static void run(Evaluator* evaluator_in, const StorageIndex first, - const StorageIndex last) { + static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, + const StorageIndex lastIdx) { Evaluator evaluator = *evaluator_in; - eigen_assert(last >= first); - StorageIndex i = first; - if (last - first >= PacketSize) { - eigen_assert(first % PacketSize == 0); - StorageIndex last_chunk_offset = last - 4 * PacketSize; + eigen_assert(lastIdx >= firstIdx); + StorageIndex i = firstIdx; + if (lastIdx - firstIdx >= PacketSize) { + eigen_assert(firstIdx % PacketSize == 0); + StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize; // Give compiler a strong possibility to unroll the loop. But don't insist // on unrolling, because if the function is expensive compiler should not // unroll the loop at the expense of inlining. 
@@ -198,12 +198,12 @@ struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> { evaluator.evalPacket(i + j * PacketSize); } } - last_chunk_offset = last - PacketSize; + last_chunk_offset = lastIdx - PacketSize; for (; i <= last_chunk_offset; i += PacketSize) { evaluator.evalPacket(i); } } - for (; i < last; ++i) { + for (; i < lastIdx; ++i) { evaluator.evalScalar(i); } } @@ -234,8 +234,8 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> { const StorageIndex size = array_prod(evaluator.dimensions()); device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), EvalRange::alignBlockSize, - [&evaluator](StorageIndex first, StorageIndex last) { - EvalRange::run(&evaluator, first, last); + [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) { + EvalRange::run(&evaluator, firstIdx, lastIdx); }); } evaluator.cleanup(); @@ -292,8 +292,8 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr void* buf = device.allocate((num_threads + 1) * aligned_blocksize); device.parallelFor( block_mapper.total_block_count(), cost * block_size, - [=, &device, &evaluator, &block_mapper](StorageIndex first, - StorageIndex last) { + [=, &device, &evaluator, &block_mapper](StorageIndex firstIdx, + StorageIndex lastIdx) { // currentThreadId() returns -1 if called from a thread not in the // thread pool, such as the main thread dispatching Eigen // expressions. @@ -301,7 +301,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr eigen_assert(thread_idx >= -1 && thread_idx < num_threads); Scalar* thread_buf = reinterpret_cast<Scalar*>( static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1)); - for (StorageIndex i = first; i < last; ++i) { + for (StorageIndex i = firstIdx; i < lastIdx; ++i) { auto block = block_mapper.GetBlockForIndex(i, thread_buf); evaluator.evalBlock(&block); } @@ -330,8 +330,8 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> { template <typename Evaluator, typename StorageIndex, bool Vectorizable> struct EigenMetaKernelEval { static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) { - for (StorageIndex i = first; i < last; i += step_size) { + void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) { + for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) { eval.evalScalar(i); } } @@ -340,17 +340,17 @@ struct EigenMetaKernelEval { template <typename Evaluator, typename StorageIndex> struct EigenMetaKernelEval<Evaluator, StorageIndex, true> { static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) { + void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) { const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; - const StorageIndex vectorized_size = (last / PacketSize) * PacketSize; + const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize; const StorageIndex vectorized_step_size = step_size * PacketSize; // Use the vector path - for (StorageIndex i = first * PacketSize; i < vectorized_size; + for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size; i += vectorized_step_size) { eval.evalPacket(i); } - for (StorageIndex i = vectorized_size + first; i < last; i += step_size) { + for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) { eval.evalScalar(i); } 
} diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 59c1704ed..4837f2200 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -273,21 +273,21 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device const Index initialIndex = index; Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { - const Index first = index; - const Index last = index + PacketSize - 1; + const Index firstIdx = index; + const Index lastIdx = index + PacketSize - 1; const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; const Index lastPaddedRight = m_outputStrides[i+1]; - if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1<PacketReturnType>(m_paddingValue); } - else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1<PacketReturnType>(m_paddingValue); } - else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { + else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. const Index idx = index / m_outputStrides[i]; inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; @@ -299,21 +299,21 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device } } - const Index last = index + PacketSize - 1; - const Index first = index; + const Index lastIdx = index + PacketSize - 1; + const Index firstIdx = index; const Index lastPaddedLeft = m_padding[0].first; const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); const Index lastPaddedRight = m_outputStrides[1]; - if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1<PacketReturnType>(m_paddingValue); } - else if (!isRightPaddingCompileTimeZero(0) && first >= firstPaddedRight && last < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1<PacketReturnType>(m_paddingValue); } - else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { + else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. 
inputIndex += (index - m_padding[0].first); return m_impl.template packet<Unaligned>(inputIndex); @@ -331,21 +331,21 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device Index inputIndex = 0; for (int i = 0; i < NumDims - 1; ++i) { - const Index first = index; - const Index last = index + PacketSize - 1; + const Index firstIdx = index; + const Index lastIdx = index + PacketSize - 1; const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1]; const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; const Index lastPaddedRight = m_outputStrides[i]; - if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1<PacketReturnType>(m_paddingValue); } - else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1<PacketReturnType>(m_paddingValue); } - else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { + else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. const Index idx = index / m_outputStrides[i+1]; inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; @@ -357,21 +357,21 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device } } - const Index last = index + PacketSize - 1; - const Index first = index; + const Index lastIdx = index + PacketSize - 1; + const Index firstIdx = index; const Index lastPaddedLeft = m_padding[NumDims-1].first; const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); const Index lastPaddedRight = m_outputStrides[NumDims-1]; - if (!isLeftPaddingCompileTimeZero(NumDims-1) && last < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(NumDims-1) && lastIdx < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1<PacketReturnType>(m_paddingValue); } - else if (!isRightPaddingCompileTimeZero(NumDims-1) && first >= firstPaddedRight && last < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(NumDims-1) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1<PacketReturnType>(m_paddingValue); } - else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { + else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. 
inputIndex += (index - m_padding[NumDims-1].first); return m_impl.template packet<Unaligned>(inputIndex); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h index 7504c1598..88940e6e6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h @@ -208,8 +208,8 @@ __global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Sel eigen_assert(blockDim.x == 1); eigen_assert(gridDim.x == 1); if (num_coeffs % 2 != 0) { - half last = input.m_impl.coeff(num_coeffs-1); - *scratch = __halves2half2(last, reducer.initialize()); + half lastCoeff = input.m_impl.coeff(num_coeffs-1); + *scratch = __halves2half2(lastCoeff, reducer.initialize()); } else { *scratch = reducer.template initializePacket<half2>(); } diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h index 22c952ae1..7a71f89fd 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h @@ -128,7 +128,7 @@ class EventCount { // Notify wakes one or all waiting threads. // Must be called after changing the associated wait predicate. - void Notify(bool all) { + void Notify(bool notifyAll) { std::atomic_thread_fence(std::memory_order_seq_cst); uint64_t state = state_.load(std::memory_order_acquire); for (;;) { @@ -137,7 +137,7 @@ class EventCount { return; uint64_t waiters = (state & kWaiterMask) >> kWaiterShift; uint64_t newstate; - if (all) { + if (notifyAll) { // Reset prewait counter and empty wait list. newstate = (state & kEpochMask) + (kEpochInc * waiters) + kStackMask; } else if (waiters) { @@ -157,10 +157,10 @@ class EventCount { } if (state_.compare_exchange_weak(state, newstate, std::memory_order_acquire)) { - if (!all && waiters) return; // unblocked pre-wait thread + if (!notifyAll && waiters) return; // unblocked pre-wait thread if ((state & kStackMask) == kStackMask) return; Waiter* w = &waiters_[state & kStackMask]; - if (!all) w->next.store(nullptr, std::memory_order_relaxed); + if (!notifyAll) w->next.store(nullptr, std::memory_order_relaxed); Unpark(w); return; } diff --git a/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h b/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h index 3f1ff14ad..42c99e467 100644 --- a/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h +++ b/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h @@ -228,6 +228,9 @@ template<typename _Scalar, int _Options, typename _StorageIndex> EIGEN_DEPRECATED inline DynamicSparseMatrix() : m_innerSize(0), m_data(0) { + #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + #endif eigen_assert(innerSize()==0 && outerSize()==0); } @@ -235,6 +238,9 @@ template<typename _Scalar, int _Options, typename _StorageIndex> EIGEN_DEPRECATED inline DynamicSparseMatrix(Index rows, Index cols) : m_innerSize(0) { + #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + #endif resize(rows, cols); } @@ -243,12 +249,18 @@ template<typename _Scalar, int _Options, typename _StorageIndex> EIGEN_DEPRECATED explicit inline DynamicSparseMatrix(const SparseMatrixBase<OtherDerived>& other) : m_innerSize(0) { - Base::operator=(other.derived()); + #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + #endif + Base::operator=(other.derived()); } inline DynamicSparseMatrix(const DynamicSparseMatrix& 
other) : Base(), m_innerSize(0) { + #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + #endif *this = other.derived(); }
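
The Redux.h hunk above replaces the old `MaySliceVectorize` test (`InnerMaxSize >= 3*PacketSize`) with a `SliceVectorizedWork` estimate that also accounts for the outer dimension. Below is a minimal standalone sketch of that arithmetic, assuming the same `Dynamic` sentinel and the threshold of 3 used in the hunk; the helper functions and the example sizes are hypothetical and only mirror the enum logic.

```cpp
#include <iostream>

constexpr int Dynamic = -1;  // same sentinel Eigen uses for runtime-sized dimensions

// Mirrors SliceVectorizedWork from the Redux.h hunk: number of full packets a
// slice-vectorized reduction would process, or Dynamic if it cannot be known statically.
constexpr int sliceVectorizedWork(int innerMax, int outerMax, int packetSize) {
  return innerMax == Dynamic ? Dynamic
       : outerMax == Dynamic ? (innerMax >= packetSize ? Dynamic : 0)
       : (innerMax / packetSize) * outerMax;
}

// Mirrors the new MaySliceVectorize condition (ignoring the MightVectorize prerequisite):
// slice-vectorize if the packet work is unknown at compile time or is at least 3 packets.
constexpr bool maySliceVectorize(int innerMax, int outerMax, int packetSize) {
  return sliceVectorizedWork(innerMax, outerMax, packetSize) == Dynamic
      || sliceVectorizedWork(innerMax, outerMax, packetSize) >= 3;
}

int main() {
  std::cout << maySliceVectorize(8, 4, 8) << "\n";        // 1: one packet per inner run, 4 runs
  std::cout << maySliceVectorize(4, 4, 8) << "\n";        // 0: no inner run fills even one packet
  std::cout << maySliceVectorize(Dynamic, 4, 8) << "\n";  // 1: size unknown, keep the option open
}
```

Under the old rule a PacketSize-by-4 block could never slice-vectorize, since its inner size is always below 3*PacketSize; the work-based rule admits it, which matches the test_redux expectation changed from DefaultTraversal to SliceVectorizedTraversal in vectorization_logic.cpp above.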
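The TensorDeviceThreadPool.h hunk above only renames `first`/`last` to `firstIdx`/`lastIdx`, but the surrounding `parallelFor` scheme is worth seeing in isolation: the range is split recursively into `block_size`-aligned halves, one half is pushed to the pool and the other is handled in place. The sketch below reproduces that splitting with printing standing in for calling `f` and for scheduling; the driver sizes are arbitrary.

```cpp
#include <cstdio>

using Index = long;

static Index divup(Index x, Index y) { return (x + y - 1) / y; }

// Same splitting scheme as handleRange in the hunk, minus the thread pool and barrier:
// leaves of at most block_size elements, with the midpoint rounded up to a block boundary.
void handleRange(Index firstIdx, Index lastIdx, Index block_size) {
  if (lastIdx - firstIdx <= block_size) {
    std::printf("leaf [%ld, %ld)\n", firstIdx, lastIdx);  // the real code calls f(firstIdx, lastIdx)
    return;
  }
  Index mid = firstIdx + divup((lastIdx - firstIdx) / 2, block_size) * block_size;
  handleRange(mid, lastIdx, block_size);   // the real code schedules this half on the pool
  handleRange(firstIdx, mid, block_size);  // and recurses on this half in the current thread
}

int main() { handleRange(0, 1000, 128); }
```

Keeping `mid` block-aligned is what produces exactly the "block_count leaves that do actual computations" mentioned in the hunk's comment, so each leaf can notify the `Barrier` constructed with that count.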
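The TensorExecutor.h hunk above renames the loop bounds inside the vectorized `EvalRange::run`; the loop structure itself (groups of four packets, then single packets, then a scalar tail) is easy to check with stand-in callbacks. In the sketch below `printf` stands in for `evalPacket`/`evalScalar`, and the packet size is an arbitrary runtime argument rather than a trait.

```cpp
#include <cstdio>

using Index = long;

// Same loop structure as EvalRange<..., Vectorizable=true>::run in the hunk above.
void evalRange(Index firstIdx, Index lastIdx, Index PacketSize) {
  Index i = firstIdx;
  if (lastIdx - firstIdx >= PacketSize) {
    // The real code asserts firstIdx % PacketSize == 0 here.
    Index last_chunk_offset = lastIdx - 4 * PacketSize;
    // Groups of four packets: a strong unrolling hint without forcing it on the compiler.
    for (; i <= last_chunk_offset; i += 4 * PacketSize)
      for (Index j = 0; j < 4; ++j)
        std::printf("packet at %ld\n", i + j * PacketSize);  // evaluator.evalPacket(...)
    last_chunk_offset = lastIdx - PacketSize;
    // Remaining whole packets.
    for (; i <= last_chunk_offset; i += PacketSize)
      std::printf("packet at %ld\n", i);                     // evaluator.evalPacket(i)
  }
  // Scalar tail for whatever does not fill a packet.
  for (; i < lastIdx; ++i)
    std::printf("scalar at %ld\n", i);                       // evaluator.evalScalar(i)
}

int main() { evalRange(0, 37, 8); }  // packets at 0, 8, 16, 24; scalars 32..36
```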
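The DynamicSparseMatrix hunks above add the `EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN` hook to each constructor: if the macro is defined before the Eigen headers are included, its body runs whenever one of these constructors executes. A sketch of one way to use it, assuming the unsupported SparseExtra header is available; the counter name and the expected count of 1 are illustrative assumptions, not something the commit guarantees.

```cpp
// Define the hook before including Eigen so the constructors in the hunk above pick it up.
static long g_sparse_temporaries = 0;
#define EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN g_sparse_temporaries++;

#include <unsupported/Eigen/SparseExtra>  // provides the (deprecated) DynamicSparseMatrix
#include <cstdio>

int main() {
  Eigen::DynamicSparseMatrix<double> m(10, 10);  // the (rows, cols) constructor runs the hook
  std::printf("temporaries counted: %ld\n", g_sparse_temporaries);  // expected: 1 for this line
}
```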