diff options
author | Gael Guennebaud <g.gael@free.fr> | 2008-06-23 15:50:28 +0000 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2008-06-23 15:50:28 +0000 |
commit | ac9aa47bbc3ab6a6921c2df9d2430bc054196be6 (patch) | |
tree | b994f6b39bc3ec25677352684b37e226a2a76630 | |
parent | ea1990ef3d95a2e042b0ececdc4f21c0f5473cc2 (diff) |
optimize linear vectorization both in Assign and Sum (optimal amortized perf)
-rw-r--r-- | Eigen/src/Core/Assign.h | 17 | ||||
-rw-r--r-- | Eigen/src/Core/Sum.h | 35 |
2 files changed, 24 insertions, 28 deletions
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h index 63eda1e85..c28a0371b 100644 --- a/Eigen/src/Core/Assign.h +++ b/Eigen/src/Core/Assign.h @@ -307,12 +307,17 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling> int index = 0; // do the vectorizable part of the assignment - for ( ; index<alignedSize ; index+=packetSize) + int row = 0; + int col = 0; + while (index<alignedSize) { - // FIXME the following is not really efficient - const int row = rowMajor ? index/innerSize : index%innerSize; - const int col = rowMajor ? index%innerSize : index/innerSize; - dst.template writePacket<Aligned>(row, col, src.template packet<Aligned>(row, col)); + int start = rowMajor ? col : row; + int end = std::min(innerSize, start + alignedSize-index); + for ( ; (rowMajor ? col : row)<end; (rowMajor ? col : row)+=packetSize) + dst.template writePacket<Aligned>(row, col, src.template packet<Aligned>(row, col)); + index += (rowMajor ? col : row) - start; + row = rowMajor ? index/innerSize : index%innerSize; + col = rowMajor ? index%innerSize : index/innerSize; } // now we must do the rest without vectorization. @@ -380,7 +385,7 @@ struct ei_assign_impl<Derived1, Derived2, SliceVectorization, NoUnrolling> const int innerSize = rowMajor ? dst.cols() : dst.rows(); const int outerSize = rowMajor ? dst.rows() : dst.cols(); const int alignedInnerSize = (innerSize/packetSize)*packetSize; - + for(int i = 0; i < outerSize; i++) { // do the vectorizable part of the assignment diff --git a/Eigen/src/Core/Sum.h b/Eigen/src/Core/Sum.h index c6b7cacce..d638f0979 100644 --- a/Eigen/src/Core/Sum.h +++ b/Eigen/src/Core/Sum.h @@ -54,7 +54,7 @@ public: Unrolling = Cost <= UnrollingLimit ? CompleteUnrolling : NoUnrolling - }; + }; }; /*************************************************************************** @@ -62,7 +62,7 @@ public: ***************************************************************************/ /*** no vectorization ***/ - + template<typename Derived, int Start, int Length> struct ei_sum_novec_unroller { @@ -194,32 +194,23 @@ struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling> // do the vectorizable part of the sum if(size >= packetSize) { - asm("#begin"); - PacketScalar packet_res; packet_res = mat.template packet<Aligned>(0, 0); - int index; - if(Derived::IsVectorAtCompileTime) - { - for(index = packetSize; index<alignedSize ; index+=packetSize) - { - const int row = Derived::RowsAtCompileTime==1 ? 0 : index; - const int col = Derived::RowsAtCompileTime==1 ? index : 0; - packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col)); - } - } - else + int row = 0; + int col = 0; + int index = packetSize; + while (index<alignedSize) { - for(index = packetSize; index<alignedSize ; index+=packetSize) - { - // FIXME the following is not really efficient - const int row = rowMajor ? index/innerSize : index%innerSize; - const int col = rowMajor ? index%innerSize : index/innerSize; + row = rowMajor ? index/innerSize : index%innerSize; + col = rowMajor ? index%innerSize : index/innerSize; + int start = rowMajor ? col : row; + int end = std::min(innerSize, start+alignedSize-index); + if (end<start) getchar(); + for ( ; (rowMajor ? col : row)<end; (rowMajor ? col : row)+=packetSize) packet_res = ei_padd(packet_res, mat.template packet<Aligned>(row, col)); - } + index += (rowMajor ? col : row) - start; } res = ei_predux(packet_res); - asm("#end"); // now we must do the rest without vectorization. if(alignedSize == size) return res; |