aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author: Gael Guennebaud <g.gael@free.fr> 2011-11-12 09:19:48 +0100
committer: Gael Guennebaud <g.gael@free.fr> 2011-11-12 09:19:48 +0100
commit: 3e4a68cc60197b75e4bc9715f6752906c420c50b (patch)
tree: fa5d90ec4a1d75598416871110595508af57530d
parent: c110abb7d255d06fcb4001671fa431d2d712fef8 (diff)
optimize vectorized reductions by peeling the loop:
- x2 for squaredNorm() on double
- peeling the loop with a peeling factor of 4 leads to even better performance for large vectors (e.g., >64), but it makes it more difficult to keep good performance on smaller ones.
-rw-r--r-- Eigen/src/Core/Redux.h 25
1 files changed, 19 insertions, 6 deletions
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index f9f5a95d5..f25dd2eeb 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -219,15 +219,28 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
alignment = bool(Derived::Flags & DirectAccessBit) || bool(Derived::Flags & AlignedBit)
? Aligned : Unaligned
};
- const Index alignedSize = ((size-alignedStart)/packetSize)*packetSize;
- const Index alignedEnd = alignedStart + alignedSize;
+ const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
+ const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
+ const Index alignedEnd2 = alignedStart + alignedSize2;
+ const Index alignedEnd = alignedStart + alignedSize;
Scalar res;
if(alignedSize)
{
- PacketScalar packet_res = mat.template packet<alignment>(alignedStart);
- for(Index index = alignedStart + packetSize; index < alignedEnd; index += packetSize)
- packet_res = func.packetOp(packet_res, mat.template packet<alignment>(index));
- res = func.predux(packet_res);
+ PacketScalar packet_res0 = mat.template packet<alignment>(alignedStart);
+ if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
+ {
+ PacketScalar packet_res1 = mat.template packet<alignment>(alignedStart+packetSize);
+ for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
+ {
+ packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment>(index));
+ packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment>(index+packetSize));
+ }
+
+ packet_res0 = func.packetOp(packet_res0,packet_res1);
+ if(alignedEnd>alignedEnd2)
+ packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment>(alignedEnd2));
+ }
+ res = func.predux(packet_res0);
for(Index index = 0; index < alignedStart; ++index)
res = func(res,mat.coeff(index));