diff options
author | Gael Guennebaud <g.gael@free.fr> | 2011-02-12 14:17:52 +0100 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2011-02-12 14:17:52 +0100 |
commit | ec7409b16ea391f44965887e0cdb3865fc56c98e (patch) | |
tree | cc35d02c40adf95837ceeef7acc30673ea5986cf /Eigen | |
parent | f7e4602a40fc5425f8832fc849311fd68f24034f (diff) |
Since gebp_kernel started handling the scaling by alpha, it has been using too many packets; this patch fixes that.
Diffstat (limited to 'Eigen')
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 80 |
1 files changed, 51 insertions, 29 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 074534861..25ca39d76 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -754,35 +754,57 @@ EIGEN_ASM_COMMENT("mybegin4"); blA += mr; } - ResPacket R0, R1, R2, R3, R4, R5, R6, R7; - ResPacket alphav = pset1<ResPacket>(alpha); - - R0 = ploadu<ResPacket>(r0); - R1 = ploadu<ResPacket>(r1); - if(nr==4) R2 = ploadu<ResPacket>(r2); - if(nr==4) R3 = ploadu<ResPacket>(r3); - R4 = ploadu<ResPacket>(r0 + ResPacketSize); - R5 = ploadu<ResPacket>(r1 + ResPacketSize); - if(nr==4) R6 = ploadu<ResPacket>(r2 + ResPacketSize); - if(nr==4) R7 = ploadu<ResPacket>(r3 + ResPacketSize); - - traits.acc(C0, alphav, R0); - traits.acc(C1, alphav, R1); - if(nr==4) traits.acc(C2, alphav, R2); - if(nr==4) traits.acc(C3, alphav, R3); - traits.acc(C4, alphav, R4); - traits.acc(C5, alphav, R5); - if(nr==4) traits.acc(C6, alphav, R6); - if(nr==4) traits.acc(C7, alphav, R7); - - pstoreu(r0, R0); - pstoreu(r1, R1); - if(nr==4) pstoreu(r2, R2); - if(nr==4) pstoreu(r3, R3); - pstoreu(r0 + ResPacketSize, R4); - pstoreu(r1 + ResPacketSize, R5); - if(nr==4) pstoreu(r2 + ResPacketSize, R6); - if(nr==4) pstoreu(r3 + ResPacketSize, R7); + if(nr==4) + { + ResPacket R0, R1, R2, R3, R4, R5, R6; + ResPacket alphav = pset1<ResPacket>(alpha); + + R0 = ploadu<ResPacket>(r0); + R1 = ploadu<ResPacket>(r1); + R2 = ploadu<ResPacket>(r2); + R3 = ploadu<ResPacket>(r3); + R4 = ploadu<ResPacket>(r0 + ResPacketSize); + R5 = ploadu<ResPacket>(r1 + ResPacketSize); + R6 = ploadu<ResPacket>(r2 + ResPacketSize); + traits.acc(C0, alphav, R0); + pstoreu(r0, R0); + R0 = ploadu<ResPacket>(r3 + ResPacketSize); + + traits.acc(C1, alphav, R1); + traits.acc(C2, alphav, R2); + traits.acc(C3, alphav, R3); + traits.acc(C4, alphav, R4); + traits.acc(C5, alphav, R5); + traits.acc(C6, alphav, R6); + traits.acc(C7, alphav, R0); + + pstoreu(r1, R1); + 
pstoreu(r2, R2); + pstoreu(r3, R3); + pstoreu(r0 + ResPacketSize, R4); + pstoreu(r1 + ResPacketSize, R5); + pstoreu(r2 + ResPacketSize, R6); + pstoreu(r3 + ResPacketSize, R0); + } + else + { + ResPacket R0, R1, R4; + ResPacket alphav = pset1<ResPacket>(alpha); + + R0 = ploadu<ResPacket>(r0); + R1 = ploadu<ResPacket>(r1); + R4 = ploadu<ResPacket>(r0 + ResPacketSize); + traits.acc(C0, alphav, R0); + pstoreu(r0, R0); + R0 = ploadu<ResPacket>(r1 + ResPacketSize); + traits.acc(C1, alphav, R1); + traits.acc(C4, alphav, R4); + traits.acc(C5, alphav, R0); + pstoreu(r1, R1); + pstoreu(r0 + ResPacketSize, R4); + pstoreu(r1 + ResPacketSize, R0); + } + } if(rows-peeled_mc>=LhsProgress) |