aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h14
1 files changed, 13 insertions, 1 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 8332a9ae0..af9b58816 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -242,14 +242,26 @@ struct InnerMostDimReducer<Self, Op, true, true> {
}
return reducer.finalize(accum);
} else {
+ const typename Self::Index UnrollSize =
+ (numValuesToReduce / (2*packetSize)) * 2*packetSize;
const typename Self::Index VectorizedSize =
(numValuesToReduce / packetSize) * packetSize;
typename Self::PacketReturnType paccum =
reducer.template initializePacket<typename Self::PacketReturnType>();
- for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
+ typename Self::PacketReturnType paccum2 =
+ reducer.template initializePacket<typename Self::PacketReturnType>();
+ for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) {
reducer.reducePacket(
self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
+ reducer.reducePacket(
+ self.m_impl.template packet<Unaligned>(firstIndex + j + packetSize),
+ &paccum2);
+ }
+ for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) {
+ reducer.reducePacket(self.m_impl.template packet<Unaligned>(
+ firstIndex + j), &paccum);
}
+ reducer.reducePacket(paccum2, &paccum);
for (typename Self::Index j = VectorizedSize; j < numValuesToReduce;
++j) {
reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);