about | summary | refs | log | tree | commit | diff | homepage
path: root/Eigen/src/Core/products/GeneralBlockPanelKernel.h
diff options
context:
space:
mode:
authorGravatar Konstantinos Margaritis <markos@codex.gr>2010-04-24 00:58:44 +0300
committerGravatar Konstantinos Margaritis <markos@codex.gr>2010-04-24 00:58:44 +0300
commit9337f371d2b08e83a56df394c838305b8c8642d8 (patch)
tree3863bc8ca18553ea0d7ac34e8fab3aa5eda682f4 /Eigen/src/Core/products/GeneralBlockPanelKernel.h
parent5acf46bd12edb79e9a83c42810654dec88227726 (diff)
(proper commit this time)
Replaced _mm_prefetch in GeneralBlockPanelKernel.h with the ei_prefetch() inline function. Implemented NEON and AltiVec versions, and copied the SSE version over from GeneralBlockPanelKernel.h. Also, in the GCC case (or rather, !_MSC_VER), it is implemented using __builtin_prefetch(). NEON managed to give a small but welcome boost: 0.88 GFLOPS -> 0.91 GFLOPS.
Diffstat (limited to 'Eigen/src/Core/products/GeneralBlockPanelKernel.h')
-rw-r--r--Eigen/src/Core/products/GeneralBlockPanelKernel.h34
1 files changed, 10 insertions, 24 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 5e219e077..bc697cef5 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -117,9 +117,7 @@ struct ei_gebp_kernel
for(int i=0; i<peeled_mc; i+=mr)
{
const Scalar* blA = &blockA[i*strideA+offsetA*mr];
- #ifdef EIGEN_VECTORIZE_SSE
- _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
- #endif
+ ei_prefetch(&blA[0]);
// TODO move the res loads to the stores
@@ -139,12 +137,10 @@ struct ei_gebp_kernel
Scalar* r2 = r1 + resStride;
Scalar* r3 = r2 + resStride;
- #ifdef EIGEN_VECTORIZE_SSE
- _mm_prefetch((const char*)(r0+16), _MM_HINT_T0);
- _mm_prefetch((const char*)(r1+16), _MM_HINT_T0);
- _mm_prefetch((const char*)(r2+16), _MM_HINT_T0);
- _mm_prefetch((const char*)(r3+16), _MM_HINT_T0);
- #endif
+ ei_prefetch(r0+16);
+ ei_prefetch(r1+16);
+ ei_prefetch(r2+16);
+ ei_prefetch(r3+16);
// performs "inner" product
// TODO let's check wether the folowing peeled loop could not be
@@ -334,9 +330,7 @@ struct ei_gebp_kernel
{
int i = peeled_mc;
const Scalar* blA = &blockA[i*strideA+offsetA*PacketSize];
- #ifdef EIGEN_VECTORIZE_SSE
- _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
- #endif
+ ei_prefetch(&blA[0]);
// gets res block as register
PacketType C0, C1, C2, C3;
@@ -464,9 +458,7 @@ struct ei_gebp_kernel
for(int i=peeled_mc2; i<rows; i++)
{
const Scalar* blA = &blockA[i*strideA+offsetA];
- #ifdef EIGEN_VECTORIZE_SSE
- _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
- #endif
+ ei_prefetch(&blA[0]);
// gets a 1 x nr res block as registers
Scalar C0(0), C1(0), C2(0), C3(0);
@@ -524,9 +516,7 @@ struct ei_gebp_kernel
for(int i=0; i<peeled_mc; i+=mr)
{
const Scalar* blA = &blockA[i*strideA+offsetA*mr];
- #ifdef EIGEN_VECTORIZE_SSE
- _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
- #endif
+ ei_prefetch(&blA[0]);
// TODO move the res loads to the stores
@@ -557,9 +547,7 @@ struct ei_gebp_kernel
{
int i = peeled_mc;
const Scalar* blA = &blockA[i*strideA+offsetA*PacketSize];
- #ifdef EIGEN_VECTORIZE_SSE
- _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
- #endif
+ ei_prefetch(&blA[0]);
PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
@@ -576,9 +564,7 @@ struct ei_gebp_kernel
for(int i=peeled_mc2; i<rows; i++)
{
const Scalar* blA = &blockA[i*strideA+offsetA];
- #ifdef EIGEN_VECTORIZE_SSE
- _mm_prefetch((const char*)(&blA[0]), _MM_HINT_T0);
- #endif
+ ei_prefetch(&blA[0]);
// gets a 1 x 1 res block as registers
Scalar C0(0);