aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author: Gael Guennebaud <g.gael@free.fr> 2008-06-23 15:50:28 +0000
committer: Gael Guennebaud <g.gael@free.fr> 2008-06-23 15:50:28 +0000
commit ac9aa47bbc3ab6a6921c2df9d2430bc054196be6 (patch)
tree b994f6b39bc3ec25677352684b37e226a2a76630
parent ea1990ef3d95a2e042b0ececdc4f21c0f5473cc2 (diff)
optimize linear vectorization both in Assign and Sum (optimal amortized perf)
-rw-r--r-- Eigen/src/Core/Assign.h 17
-rw-r--r-- Eigen/src/Core/Sum.h 35
2 files changed, 24 insertions, 28 deletions
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index 63eda1e85..c28a0371b 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -307,12 +307,17 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
int index = 0;
// do the vectorizable part of the assignment
- for ( ; index<alignedSize ; index+=packetSize)
+ int row = 0;
+ int col = 0;
+ while (index<alignedSize)
{
- // FIXME the following is not really efficient
- const int row = rowMajor ? index/innerSize : index%innerSize;
- const int col = rowMajor ? index%innerSize : index/innerSize;
- dst.template writePacket<Aligned>(row, col, src.template packet<Aligned>(row, col));
+ int start = rowMajor ? col : row;
+ int end = std::min(innerSize, start + alignedSize-index);
+ for ( ; (rowMajor ? col : row)<end; (rowMajor ? col : row)+=packetSize)
+ dst.template writePacket<Aligned>(row, col, src.template packet<Aligned>(row, col));
+ index += (rowMajor ? col : row) - start;
+ row = rowMajor ? index/innerSize : index%innerSize;
+ col = rowMajor ? index%innerSize : index/innerSize;
}
// now we must do the rest without vectorization.
@@ -380,7 +385,7 @@ struct ei_assign_impl<Derived1, Derived2, SliceVectorization, NoUnrolling>
const int innerSize = rowMajor ? dst.cols() : dst.rows();
const int outerSize = rowMajor ? dst.rows() : dst.cols();
const int alignedInnerSize = (innerSize/packetSize)*packetSize;
-
+
for(int i = 0; i < outerSize; i++)
{
// do the vectorizable part of the assignment
diff --git a/Eigen/src/Core/Sum.h b/Eigen/src/Core/Sum.h
index c6b7cacce..d638f0979 100644
--- a/Eigen/src/Core/Sum.h
+++ b/Eigen/src/Core/Sum.h
@@ -54,7 +54,7 @@ public:
Unrolling = Cost <= UnrollingLimit
? CompleteUnrolling
: NoUnrolling
- };
+ };
};
/***************************************************************************
@@ -62,7 +62,7 @@ public:
***************************************************************************/
/*** no vectorization ***/
-
+
template<typename Derived, int Start, int Length>
struct ei_sum_novec_unroller
{
@@ -194,32 +194,23 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
// do the vectorizable part of the sum
if(size >= packetSize)
{
- asm("#begin");
-
PacketScalar packet_res;
packet_res = mat.template packet<Aligned>(0, 0);
- int index;
- if(Derived::IsVectorAtCompileTime)
- {
- for(index = packetSize; index<alignedSize ; index+=packetSize)
- {
- const int row = Derived::RowsAtCompileTime==1 ? 0 : index;
- const int col = Derived::RowsAtCompileTime==1 ? index : 0;
- packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col));
- }
- }
- else
+ int row = 0;
+ int col = 0;
+ int index = packetSize;
+ while (index<alignedSize)
{
- for(index = packetSize; index<alignedSize ; index+=packetSize)
- {
- // FIXME the following is not really efficient
- const int row = rowMajor ? index/innerSize : index%innerSize;
- const int col = rowMajor ? index%innerSize : index/innerSize;
+ row = rowMajor ? index/innerSize : index%innerSize;
+ col = rowMajor ? index%innerSize : index/innerSize;
+ int start = rowMajor ? col : row;
+ int end = std::min(innerSize, start+alignedSize-index);
+ if (end<start) getchar();
+ for ( ; (rowMajor ? col : row)<end; (rowMajor ? col : row)+=packetSize)
packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col));
- }
+ index += (rowMajor ? col : row) - start;
}
res = ei_predux(packet_res);
- asm("#end");
// now we must do the rest without vectorization.
if(alignedSize == size) return res;