path: root/Eigen/src/Core/Sum.h
author    Gael Guennebaud <g.gael@free.fr>  2008-08-09 18:41:24 +0000
committer Gael Guennebaud <g.gael@free.fr>  2008-08-09 18:41:24 +0000
commit    4fa40367e9bf55ea8b2ad1040b3fb73f94e4f12f (patch)
tree      3ca6d7cff691daf2d6bc8d6b1ecb00971f9debf3 /Eigen/src/Core/Sum.h
parent    becbeda50ac17288dba0a93c6adc67b663d32a7a (diff)
* Big change in Block and Map:
  - added a MapBase base xpr on top of which Map and the specialization of Block are implemented
  - MapBase forces aligned loads (and aligned stores, see below) in expressions such as "x.block(...) += other_expr"
* Significant vectorization improvement:
  - added an AlignedBit flag meaning the first coeff/packet is aligned, which makes it possible to skip the extra code dealing with the first unaligned part
  - removed all unaligned stores when not unrolling
  - removed unaligned loads in Sum when the input has the DirectAccessBit flag
* Some code simplification in the CacheFriendly product
* Some minor documentation improvements
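To make the aligned-load strategy above concrete, here is a minimal, self-contained sketch of the access pattern this commit introduces in Sum.h: a scalar loop over the unaligned head, aligned packet loads over the middle, and a scalar loop over the tail. It assumes float data that is at least sizeof(float)-aligned and 4-wide SSE packets; sum_with_alignment is an illustration name, not Eigen code.

    #include <cstdint>
    #include <xmmintrin.h>

    // Illustration only: sums a float array using aligned SSE loads for the bulk
    // of the data and scalar accesses for the unaligned head and tail.
    float sum_with_alignment(const float* data, int size)
    {
      const int packetSize = 4; // floats per 16-byte SSE register
      // index of the first 16-byte-aligned element (0 if data is already aligned)
      const int alignedStart =
          int(((16 - reinterpret_cast<std::uintptr_t>(data) % 16) % 16) / sizeof(float));
      const int alignedEnd =
          alignedStart + ((size - alignedStart) / packetSize) * packetSize;

      float res = 0.f;
      for (int i = 0; i < alignedStart && i < size; ++i)    // unaligned head, scalar
        res += data[i];

      if (alignedEnd > alignedStart)                        // aligned middle, packets
      {
        __m128 acc = _mm_load_ps(data + alignedStart);
        for (int i = alignedStart + packetSize; i < alignedEnd; i += packetSize)
          acc = _mm_add_ps(acc, _mm_load_ps(data + i));
        float tmp[4];
        _mm_storeu_ps(tmp, acc);
        res += tmp[0] + tmp[1] + tmp[2] + tmp[3];
      }

      for (int i = alignedEnd; i < size; ++i)               // unaligned tail, scalar
        res += data[i];
      return res;
    }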
Diffstat (limited to 'Eigen/src/Core/Sum.h')
-rw-r--r--  Eigen/src/Core/Sum.h | 36
1 file changed, 25 insertions, 11 deletions
diff --git a/Eigen/src/Core/Sum.h b/Eigen/src/Core/Sum.h
index fa75429c8..6c7280800 100644
--- a/Eigen/src/Core/Sum.h
+++ b/Eigen/src/Core/Sum.h
@@ -33,17 +33,22 @@
template<typename Derived>
struct ei_sum_traits
{
+private:
+ enum {
+ PacketSize = ei_packet_traits<typename Derived::Scalar>::size
+ };
+
public:
enum {
Vectorization = (int(Derived::Flags)&ActualPacketAccessBit)
&& (int(Derived::Flags)&LinearAccessBit)
+ && (int(Derived::SizeAtCompileTime)>2*PacketSize)
? LinearVectorization
: NoVectorization
};
private:
enum {
- PacketSize = ei_packet_traits<typename Derived::Scalar>::size,
Cost = Derived::SizeAtCompileTime * Derived::CoeffReadCost
+ (Derived::SizeAtCompileTime-1) * NumTraits<typename Derived::Scalar>::AddCost,
UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Vectorization) == int(NoVectorization) ? 1 : int(PacketSize))
@@ -131,7 +136,8 @@ struct ei_sum_vec_unroller<Derived, Index, Stop, true>
: Index % Derived::RowsAtCompileTime,
col = int(Derived::Flags)&RowMajorBit
? Index % int(Derived::ColsAtCompileTime)
- : Index / Derived::RowsAtCompileTime
+ : Index / Derived::RowsAtCompileTime,
+ alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned
};
typedef typename Derived::Scalar Scalar;
@@ -139,7 +145,7 @@ struct ei_sum_vec_unroller<Derived, Index, Stop, true>
inline static PacketScalar run(const Derived &mat)
{
- return mat.template packet<Aligned>(row, col);
+ return mat.template packet<alignment>(row, col);
}
};
@@ -185,14 +191,21 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
{
const int size = mat.size();
const int packetSize = ei_packet_traits<Scalar>::size;
- const int alignedSize = (size/packetSize)*packetSize;
+ const int alignedStart = (Derived::Flags & AlignedBit)
+ || !(Derived::Flags & DirectAccessBit)
+ ? 0
+ : ei_alignmentOffset(&mat.const_cast_derived().coeffRef(0), size);
+ const int alignment = (Derived::Flags & DirectAccessBit) || (Derived::Flags & AlignedBit)
+ ? Aligned : Unaligned;
+ const int alignedSize = ((size-alignedStart)/packetSize)*packetSize;
+ const int alignedEnd = alignedStart + alignedSize;
Scalar res;
- if(size >= packetSize)
+ if(Derived::SizeAtCompileTime>=2*packetSize && alignedSize >= 2*packetSize)
{
- PacketScalar packet_res = mat.template packet<Aligned>(0, 0);
- for(int index = packetSize; index < alignedSize; index += packetSize)
- packet_res = ei_padd(packet_res, mat.template packet<Aligned>(index));
+ PacketScalar packet_res = mat.template packet<alignment>(alignedStart);
+ for(int index = alignedStart + packetSize; index < alignedEnd; index += packetSize)
+ packet_res = ei_padd(packet_res, mat.template packet<alignment>(index));
res = ei_predux(packet_res);
}
@@ -202,10 +215,11 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
res = Scalar(0);
}
- for(int index = alignedSize; index < size; index++)
- {
+ for(int index = 0; index < alignedStart; index++)
+ res += mat.coeff(index);
+
+ for(int index = alignedEnd; index < size; index++)
res += mat.coeff(index);
- }
return res;
}
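For reference, the ei_alignmentOffset call used above (to compute alignedStart when the expression has DirectAccessBit but not AlignedBit) conceptually returns the distance, in elements, from a raw pointer to the next 16-byte boundary. A hedged sketch under that assumption follows; alignment_offset_sketch is an illustration name, not Eigen's actual helper or signature.

    #include <cstdint>

    // Illustration only: number of Scalar elements to skip before reaching a
    // 16-byte boundary, capped at maxOffset (e.g. the expression size).
    template<typename Scalar>
    int alignment_offset_sketch(const Scalar* ptr, int maxOffset)
    {
      const std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(ptr);
      if (addr % sizeof(Scalar) != 0 || 16 % sizeof(Scalar) != 0)
        return maxOffset;      // 16-byte alignment is unreachable: keep everything scalar
      const int offset = int(((16 - addr % 16) % 16) / sizeof(Scalar));
      return offset < maxOffset ? offset : maxOffset;
    }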