// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2011-2018 Gael Guennebaud
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_PARTIALREDUX_H
#define EIGEN_PARTIALREDUX_H

namespace Eigen {

namespace internal {


/***************************************************************************
*
* This file provides evaluators for partial reductions.
* There are two modes:
*
*  - scalar path: simply calls the respective function on the column or row.
*    -> nothing special here, all the tricky part is handled by the return
*       types of VectorwiseOp's members. They embed the functor calling the
*       respective DenseBase's member function.
*
*  - vectorized path: implements a packet-wise reduction followed by
*    some (optional) processing of the outcome, e.g., division by n for mean.
*
* For the vectorized path let's observe that the packet-size and outer-unrolling
* are both decided by the assignment logic. So all we have to do is to decide
* on the inner unrolling.
*
* For the unrolling, we can reuse "internal::redux_vec_unroller" from Redux.h,
* but we need to be careful to specify the correct increment.
*
***************************************************************************/
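
/* For illustration, a typical use case handled by these evaluators
 * (a sketch in terms of the public dense API):
 *
 *   Eigen::MatrixXf A = Eigen::MatrixXf::Random(4,100); // column-major by default
 *   Eigen::RowVectorXf c = A.colwise().sum();           // PartialReduxExpr, Direction==Vertical
 *   Eigen::VectorXf    r = A.rowwise().sum();           // PartialReduxExpr, Direction==Horizontal
 *
 * With a column-major A, the colwise() reduction goes through the scalar path
 * (the member functor is applied to each column as a whole), whereas the
 * rowwise() reduction can take the vectorized path: each packet loads
 * PacketSize adjacent rows within one column and packets are combined column
 * by column by packetwise_redux_impl below, yielding PacketSize row sums at once.
 */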

/* logic deciding a strategy for unrolling of vectorized paths */
template<typename Func, typename Evaluator>
struct packetwise_redux_traits
{
  enum {
    OuterSize = int(Evaluator::IsRowMajor) ? Evaluator::RowsAtCompileTime : Evaluator::ColsAtCompileTime,
    Cost = OuterSize == Dynamic ? HugeCost
         : OuterSize * Evaluator::CoeffReadCost + (OuterSize-1) * functor_traits<Func>::Cost,
    Unrolling = Cost <= EIGEN_UNROLLING_LIMIT ? CompleteUnrolling : NoUnrolling
  };
};

/* Value to be returned when size==0, by default let's return 0 */
template<typename PacketType,typename Func>
EIGEN_DEVICE_FUNC
PacketType packetwise_redux_empty_value(const Func& ) { return pset1<PacketType>(0); }

/* For products the default is 1 */
template<typename PacketType,typename Scalar>
EIGEN_DEVICE_FUNC
PacketType packetwise_redux_empty_value(const scalar_product_op<Scalar,Scalar>& ) { return pset1<PacketType>(1); }

/* Perform the actual reduction */
template<typename Func, typename Evaluator,
         int Unrolling = packetwise_redux_traits<Func, Evaluator>::Unrolling
        >
struct packetwise_redux_impl;

/* Perform the actual reduction with unrolling */
template<typename Func, typename Evaluator>
struct packetwise_redux_impl<Func, Evaluator, CompleteUnrolling>
{
  typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
  typedef typename Evaluator::Scalar Scalar;

  template<typename PacketType>
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
  PacketType run(const Evaluator &eval, const Func& func, Index /*size*/)
  {
    return redux_vec_unroller<Func, Evaluator, 0, packetwise_redux_traits<Func, Evaluator>::OuterSize>::template run<PacketType>(eval,func);
  }
};

/* Add a specialization of redux_vec_unroller for size==0 at compile time.
 * This specialization is not required for general reductions, which is
 * why it is defined here.
 */
template<typename Func, typename Evaluator, int Start>
struct redux_vec_unroller<Func, Evaluator, Start, 0>
{
  template<typename PacketType>
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE PacketType run(const Evaluator &, const Func& f)
  {
    return packetwise_redux_empty_value<PacketType>(f);
  }
};

/* Perform the actual reduction for dynamic sizes */
template<typename Func, typename Evaluator>
struct packetwise_redux_impl<Func, Evaluator, NoUnrolling>
{
  typedef typename Evaluator::Scalar Scalar;
  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;

  template<typename PacketType>
  EIGEN_DEVICE_FUNC static PacketType run(const Evaluator &eval, const Func& func, Index size)
  {
    if(size==0)
      return packetwise_redux_empty_value<PacketType>(func);

    const Index size4 = (size-1)&(~3);
    PacketType p = eval.template packetByOuterInner<Unaligned,PacketType>(0,0);
    Index i = 1;
    // This loop is optimized for instruction pipelining:
    // - each iteration generates two independent instructions
    // - thanks to branch prediction and out-of-order execution we have independent instructions across loop iterations
    for(; i<size4; i+=4)
      p = func.packetOp(p,
            func.packetOp(
              func.packetOp(eval.template packetByOuterInner<Unaligned,PacketType>(i+0,0),eval.template packetByOuterInner<Unaligned,PacketType>(i+1,0)),
              func.packetOp(eval.template packetByOuterInner<Unaligned,PacketType>(i+2,0),eval.template packetByOuterInner<Unaligned,PacketType>(i+3,0))));
    for(; i<size; ++i)
      p = func.packetOp(p, eval.template packetByOuterInner<Unaligned,PacketType>(i,0));
    return p;
  }
};
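
/* For illustration, a short trace of the dynamic path above (sketch only):
 * for size==10 we get size4 = (10-1)&~3 = 8; p is seeded with packet 0, the
 * unrolled loop runs for i=1 and i=5, each iteration reducing packets
 * (i,i+1) and (i+2,i+3) as two independent pairs before folding the result
 * into p, and the tail loop handles the remaining packet 9. The pairwise
 * grouping keeps two independent dependency chains in flight per iteration,
 * which is the pipelining benefit mentioned in the comment inside run().
 */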

template< typename ArgType, typename MemberOp, int Direction>
struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
  : evaluator_base<PartialReduxExpr<ArgType, MemberOp, Direction> >
{
  typedef PartialReduxExpr<ArgType, MemberOp, Direction> XprType;
  typedef typename internal::nested_eval<ArgType,1>::type ArgTypeNested;
  typedef typename internal::add_const_on_value_type<ArgTypeNested>::type ConstArgTypeNested;
  typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
  typedef typename ArgType::Scalar InputScalar;
  typedef typename XprType::Scalar Scalar;
  enum {
    TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(ArgType::ColsAtCompileTime)
  };
  typedef typename MemberOp::template Cost<int(TraversalSize)> CostOpType;
  enum {
    CoeffReadCost = TraversalSize==Dynamic ? HugeCost
                  : TraversalSize==0 ? 1
                  : int(TraversalSize) * int(evaluator<ArgType>::CoeffReadCost) + int(CostOpType::value),

    _ArgFlags = evaluator<ArgType>::Flags,

    _Vectorizable = bool(int(_ArgFlags)&PacketAccessBit)
                  && bool(MemberOp::Vectorizable)
                  && (Direction==int(Vertical) ? bool(_ArgFlags&RowMajorBit) : (_ArgFlags&RowMajorBit)==0)
                  && (TraversalSize!=0),

    Flags = (traits<XprType>::Flags&RowMajorBit)
          | (evaluator<ArgType>::Flags&(HereditaryBits&(~RowMajorBit)))
          | (_Vectorizable ? PacketAccessBit : 0)
          | LinearAccessBit,

    Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized
  };

  EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr)
    : m_arg(xpr.nestedExpression()), m_functor(xpr.functor())
  {
    EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : (TraversalSize==0 ? 1 : int(CostOpType::value)));
    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Scalar coeff(Index i, Index j) const
  {
    return coeff(Direction==Vertical ? j : i);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Scalar coeff(Index index) const
  {
    return m_functor(m_arg.template subVector<DirectionType(Direction)>(index));
  }

  template<int LoadMode,typename PacketType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  PacketType packet(Index i, Index j) const
  {
    return packet<LoadMode,PacketType>(Direction==Vertical ? j : i);
  }

  template<int LoadMode,typename PacketType>
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
  PacketType packet(Index idx) const
  {
    enum { PacketSize = internal::unpacket_traits<PacketType>::size };
    typedef Block<const ArgTypeNestedCleaned,
                  Direction==Vertical ? int(ArgType::RowsAtCompileTime) : int(PacketSize),
                  Direction==Vertical ? int(PacketSize) : int(ArgType::ColsAtCompileTime),
                  true /* InnerPanel */> PanelType;

    PanelType panel(m_arg,
                    Direction==Vertical ? 0 : idx,
                    Direction==Vertical ? idx : 0,
                    Direction==Vertical ? m_arg.rows() : Index(PacketSize),
                    Direction==Vertical ? Index(PacketSize) : m_arg.cols());

    // FIXME
    // See bug 1612: currently if PacketSize==1 (i.e. complex<double> with 128-bit registers) then the storage order of panel gets reversed
    // and methods like packetByOuterInner do not make sense anymore in this context.
    // So let's just bypass "vectorization" in this case:
    if(PacketSize==1)
      return internal::pset1<PacketType>(coeff(idx));

    typedef typename internal::redux_evaluator<PanelType> PanelEvaluator;
    PanelEvaluator panel_eval(panel);
    typedef typename MemberOp::BinaryOp BinaryOp;
    PacketType p = internal::packetwise_redux_impl<BinaryOp,PanelEvaluator>::template run<PacketType>(panel_eval,m_functor.binaryFunc(),m_arg.outerSize());

    return p;
  }

protected:
  ConstArgTypeNested m_arg;
  const MemberOp m_functor;
};

} // end namespace internal
} // end namespace Eigen

#endif // EIGEN_PARTIALREDUX_H