diff options
author | 2008-08-09 18:41:24 +0000 | |
---|---|---|
committer | 2008-08-09 18:41:24 +0000 | |
commit | 4fa40367e9bf55ea8b2ad1040b3fb73f94e4f12f (patch) | |
tree | 3ca6d7cff691daf2d6bc8d6b1ecb00971f9debf3 /Eigen/src/Core/Sum.h | |
parent | becbeda50ac17288dba0a93c6adc67b663d32a7a (diff) |
* Big change in Block and Map:
- added a MapBase base xpr on top of which Map and the specialization
of Block are implemented
- MapBase forces both aligned loads (and aligned stores, see below) in expressions
such as "x.block(...) += other_expr"
* Significant vectorization improvement:
- added an AlignedBit flag meaning the first coeff/packet is aligned;
this avoids generating extra code to deal with the first unaligned part
- removed all unaligned stores when no unrolling
- removed unaligned loads in Sum when the input has the DirectAccessBit flag
* Some code simplification in CacheFriendly product
* Some minor documentation improvements
Diffstat (limited to 'Eigen/src/Core/Sum.h')
-rw-r--r-- | Eigen/src/Core/Sum.h | 36 |
1 files changed, 25 insertions, 11 deletions
diff --git a/Eigen/src/Core/Sum.h b/Eigen/src/Core/Sum.h
index fa75429c8..6c7280800 100644
--- a/Eigen/src/Core/Sum.h
+++ b/Eigen/src/Core/Sum.h
@@ -33,17 +33,22 @@
 template<typename Derived>
 struct ei_sum_traits
 {
+private:
+  enum {
+    PacketSize = ei_packet_traits<typename Derived::Scalar>::size
+  };
+
 public:
   enum {
     Vectorization = (int(Derived::Flags)&ActualPacketAccessBit)
                  && (int(Derived::Flags)&LinearAccessBit)
+                 && (int(Derived::SizeAtCompileTime)>2*PacketSize)
                   ? LinearVectorization
                   : NoVectorization
   };
 
 private:
   enum {
-    PacketSize = ei_packet_traits<typename Derived::Scalar>::size,
     Cost = Derived::SizeAtCompileTime * Derived::CoeffReadCost
            + (Derived::SizeAtCompileTime-1) * NumTraits<typename Derived::Scalar>::AddCost,
     UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Vectorization) == int(NoVectorization) ? 1 : int(PacketSize))
@@ -131,7 +136,8 @@ struct ei_sum_vec_unroller<Derived, Index, Stop, true>
         : Index % Derived::RowsAtCompileTime,
     col = int(Derived::Flags)&RowMajorBit
         ? Index % int(Derived::ColsAtCompileTime)
-        : Index / Derived::RowsAtCompileTime
+        : Index / Derived::RowsAtCompileTime,
+    alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned
   };
 
   typedef typename Derived::Scalar Scalar;
@@ -139,7 +145,7 @@ struct ei_sum_vec_unroller<Derived, Index, Stop, true>
 
   inline static PacketScalar run(const Derived &mat)
   {
-    return mat.template packet<Aligned>(row, col);
+    return mat.template packet<alignment>(row, col);
   }
 };
 
@@ -185,14 +191,21 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
   {
     const int size = mat.size();
     const int packetSize = ei_packet_traits<Scalar>::size;
-    const int alignedSize = (size/packetSize)*packetSize;
+    const int alignedStart =  (Derived::Flags & AlignedBit)
+                           || !(Derived::Flags & DirectAccessBit)
+                           ? 0
+                           : ei_alignmentOffset(&mat.const_cast_derived().coeffRef(0), size);
+    const int alignment = (Derived::Flags & DirectAccessBit) || (Derived::Flags & AlignedBit)
+                        ? Aligned : Unaligned;
+    const int alignedSize = ((size-alignedStart)/packetSize)*packetSize;
+    const int alignedEnd = alignedStart + alignedSize;
     Scalar res;
 
-    if(size >= packetSize)
+    if(Derived::SizeAtCompileTime>=2*packetSize && alignedSize >= packetSize) // one full packet is enough for the packet path
     {
-      PacketScalar packet_res = mat.template packet<Aligned>(0, 0);
-      for(int index = packetSize; index < alignedSize; index += packetSize)
-        packet_res = ei_padd(packet_res, mat.template packet<Aligned>(index));
+      PacketScalar packet_res = mat.template packet<alignment>(alignedStart); // linear index, same overload as the loop below
+      for(int index = alignedStart + packetSize; index < alignedEnd; index += packetSize)
+        packet_res = ei_padd(packet_res, mat.template packet<alignment>(index));
 
       res = ei_predux(packet_res);
     }
@@ -202,10 +215,11 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
       res = Scalar(0);
     }
 
-    for(int index = alignedSize; index < size; index++)
-    {
+    for(int index = 0; index < alignedStart; index++) // scalar head: coeffs before the first packet load
+      res += mat.coeff(index);
+
+    for(int index = alignedEnd; index < size; index++) // scalar tail: coeffs past the last full packet
       res += mat.coeff(index);
-    }
 
     return res;
   }