diff options
author | 2014-04-25 10:56:18 +0200 | |
---|---|---|
committer | 2014-04-25 10:56:18 +0200 | |
commit | 3d8d0f6269478a06f4fcbd4b838c8e9b9d7e9d62 (patch) | |
tree | 9f55e81498c1acc61ea0c10bce0ead69a71875d1 /Eigen/src/Core/products/GeneralBlockPanelKernel.h | |
parent | b0e19db1cf462a07e25429d4f04f7d8e858f670f (diff) |
Enable vectorization of pack_rhs with a column-major RHS.
Rename and generalize Kernel<*> to PacketBlock<*,N>.
Diffstat (limited to 'Eigen/src/Core/products/GeneralBlockPanelKernel.h')
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 25 |
1 file changed, 14 insertions, 11 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 60251f624..0a94f25e4 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1585,7 +1585,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj { for (Index m = 0; m < pack; m += PacketSize) { - Kernel<Packet> kernel; + PacketBlock<Packet> kernel; for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = ploadu<Packet>(&lhs(i+p+m, k)); ptranspose(kernel); for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); @@ -1675,7 +1675,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan // if(PacketSize==8) // TODO enbale vectorized transposition for PacketSize==4 // { // for(; k<peeled_k; k+=PacketSize) { -// Kernel<Packet> kernel; +// PacketBlock<Packet> kernel; // for (int p = 0; p < PacketSize; ++p) { // kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]); // } @@ -1713,19 +1713,22 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan const Scalar* b1 = &rhs[(j2+1)*rhsStride]; const Scalar* b2 = &rhs[(j2+2)*rhsStride]; const Scalar* b3 = &rhs[(j2+3)*rhsStride]; + Index k=0; - if(PacketSize==4) // TODO enbale vectorized transposition for PacketSize==2 ?? + if((PacketSize%4)==0) // TODO enbale vectorized transposition for PacketSize==2 ?? 
{ for(; k<peeled_k; k+=PacketSize) { - Kernel<Packet> kernel; - for (int p = 0; p < PacketSize; ++p) { - kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]); - } + PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel; + kernel.packet[0] = ploadu<Packet>(&b0[k]); + kernel.packet[1] = ploadu<Packet>(&b1[k]); + kernel.packet[2] = ploadu<Packet>(&b2[k]); + kernel.packet[3] = ploadu<Packet>(&b3[k]); ptranspose(kernel); - for (int p = 0; p < PacketSize; ++p) { - pstoreu(blockB+count, cj.pconj(kernel.packet[p])); - count+=PacketSize; - } + pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); + pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1])); + pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2])); + pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3])); + count+=4*PacketSize; } } for(; k<depth; k++) |