aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2011-02-12 14:17:52 +0100
committerGravatar Gael Guennebaud <g.gael@free.fr>2011-02-12 14:17:52 +0100
commitec7409b16ea391f44965887e0cdb3865fc56c98e (patch)
treecc35d02c40adf95837ceeef7acc30673ea5986cf /Eigen
parentf7e4602a40fc5425f8832fc849311fd68f24034f (diff)
since gebp_kernel handled the scaling by alpha it used too many packets, this patch fix that.
Diffstat (limited to 'Eigen')
-rw-r--r--Eigen/src/Core/products/GeneralBlockPanelKernel.h80
1 files changed, 51 insertions, 29 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 074534861..25ca39d76 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -754,35 +754,57 @@ EIGEN_ASM_COMMENT("mybegin4");
blA += mr;
}
- ResPacket R0, R1, R2, R3, R4, R5, R6, R7;
- ResPacket alphav = pset1<ResPacket>(alpha);
-
- R0 = ploadu<ResPacket>(r0);
- R1 = ploadu<ResPacket>(r1);
- if(nr==4) R2 = ploadu<ResPacket>(r2);
- if(nr==4) R3 = ploadu<ResPacket>(r3);
- R4 = ploadu<ResPacket>(r0 + ResPacketSize);
- R5 = ploadu<ResPacket>(r1 + ResPacketSize);
- if(nr==4) R6 = ploadu<ResPacket>(r2 + ResPacketSize);
- if(nr==4) R7 = ploadu<ResPacket>(r3 + ResPacketSize);
-
- traits.acc(C0, alphav, R0);
- traits.acc(C1, alphav, R1);
- if(nr==4) traits.acc(C2, alphav, R2);
- if(nr==4) traits.acc(C3, alphav, R3);
- traits.acc(C4, alphav, R4);
- traits.acc(C5, alphav, R5);
- if(nr==4) traits.acc(C6, alphav, R6);
- if(nr==4) traits.acc(C7, alphav, R7);
-
- pstoreu(r0, R0);
- pstoreu(r1, R1);
- if(nr==4) pstoreu(r2, R2);
- if(nr==4) pstoreu(r3, R3);
- pstoreu(r0 + ResPacketSize, R4);
- pstoreu(r1 + ResPacketSize, R5);
- if(nr==4) pstoreu(r2 + ResPacketSize, R6);
- if(nr==4) pstoreu(r3 + ResPacketSize, R7);
+ if(nr==4)
+ {
+ ResPacket R0, R1, R2, R3, R4, R5, R6;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = ploadu<ResPacket>(r0);
+ R1 = ploadu<ResPacket>(r1);
+ R2 = ploadu<ResPacket>(r2);
+ R3 = ploadu<ResPacket>(r3);
+ R4 = ploadu<ResPacket>(r0 + ResPacketSize);
+ R5 = ploadu<ResPacket>(r1 + ResPacketSize);
+ R6 = ploadu<ResPacket>(r2 + ResPacketSize);
+ traits.acc(C0, alphav, R0);
+ pstoreu(r0, R0);
+ R0 = ploadu<ResPacket>(r3 + ResPacketSize);
+
+ traits.acc(C1, alphav, R1);
+ traits.acc(C2, alphav, R2);
+ traits.acc(C3, alphav, R3);
+ traits.acc(C4, alphav, R4);
+ traits.acc(C5, alphav, R5);
+ traits.acc(C6, alphav, R6);
+ traits.acc(C7, alphav, R0);
+
+ pstoreu(r1, R1);
+ pstoreu(r2, R2);
+ pstoreu(r3, R3);
+ pstoreu(r0 + ResPacketSize, R4);
+ pstoreu(r1 + ResPacketSize, R5);
+ pstoreu(r2 + ResPacketSize, R6);
+ pstoreu(r3 + ResPacketSize, R0);
+ }
+ else
+ {
+ ResPacket R0, R1, R4;
+ ResPacket alphav = pset1<ResPacket>(alpha);
+
+ R0 = ploadu<ResPacket>(r0);
+ R1 = ploadu<ResPacket>(r1);
+ R4 = ploadu<ResPacket>(r0 + ResPacketSize);
+ traits.acc(C0, alphav, R0);
+ pstoreu(r0, R0);
+ R0 = ploadu<ResPacket>(r1 + ResPacketSize);
+ traits.acc(C1, alphav, R1);
+ traits.acc(C4, alphav, R4);
+ traits.acc(C5, alphav, R0);
+ pstoreu(r1, R1);
+ pstoreu(r0 + ResPacketSize, R4);
+ pstoreu(r1 + ResPacketSize, R0);
+ }
+
}
if(rows-peeled_mc>=LhsProgress)