diff options
author | Benoit Jacob <jacob.benoit.1@gmail.com> | 2008-06-26 16:06:41 +0000 |
---|---|---|
committer | Benoit Jacob <jacob.benoit.1@gmail.com> | 2008-06-26 16:06:41 +0000 |
commit | 25ba9f377c97968923cd654d419fa8ce260f114d (patch) | |
tree | 52003e7b8896a534603ecd161b31262aa8761fed /bench/benchVecAdd.cpp | |
parent | 5b0da4b778d8f51e21dd7c35f6f7397c38c6be03 (diff) |
* add bench/benchVecAdd.cpp by Gael, fix crash (ei_pload on non-aligned)
* introduce packet(int), make use of it in linear vectorized paths
--> completely fixes the slowdown noticed in benchVecAdd.
* generalize coeff(int) to linear-access xprs
* clarify the access flag bits
* rework api dox in Coeffs.h and util/Constants.h
* improve certain expressions' flags, allowing more vectorization
* fix bug in Block: start(int) and end(int) returned dyn*dyn size
* fix bug in Block: just because the Eval type has packet access
doesn't imply the block xpr should have it too.
Diffstat (limited to 'bench/benchVecAdd.cpp')
-rw-r--r-- | bench/benchVecAdd.cpp | 134 |
1 files changed, 134 insertions, 0 deletions
diff --git a/bench/benchVecAdd.cpp b/bench/benchVecAdd.cpp new file mode 100644 index 000000000..aa211dce0 --- /dev/null +++ b/bench/benchVecAdd.cpp @@ -0,0 +1,134 @@ + +#include <Eigen/Core> +#include <bench/BenchTimer.h> +using namespace Eigen; + +#ifndef SIZE +#define SIZE 50 +#endif + +#ifndef REPEAT +#define REPEAT 10000 +#endif + +typedef float Scalar; + +__attribute__ ((noinline)) void benchVec(Scalar* a, Scalar* b, Scalar* c, int size); +__attribute__ ((noinline)) void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c); +__attribute__ ((noinline)) void benchVec(VectorXf& a, VectorXf& b, VectorXf& c); + +int main(int argc, char* argv[]) +{ + int size = SIZE * 8; + int size2 = size * size; + Scalar* a = ei_aligned_malloc<Scalar>(size2); + Scalar* b = ei_aligned_malloc<Scalar>(size2); + Scalar* c = ei_aligned_malloc<Scalar>(size2); + + for (int i=0; i<size; ++i) + { + a[i] = b[i] = c[i] = 0; + } + + BenchTimer timer; + + timer.reset(); + for (int k=0; k<3; ++k) + { + timer.start(); + benchVec(a, b, c, size2); + timer.stop(); + } + std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n"; + + for (int innersize = size; innersize>2 ; --innersize) + { + if (size2%innersize==0) + { + int outersize = size2/innersize; + MatrixXf ma = MatrixXf::map(a, innersize, outersize ); + MatrixXf mb = MatrixXf::map(b, innersize, outersize ); + MatrixXf mc = MatrixXf::map(c, innersize, outersize ); + timer.reset(); + for (int k=0; k<3; ++k) + { + timer.start(); + benchVec(ma, mb, mc); + timer.stop(); + } + std::cout << innersize << " x " << outersize << " " << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) 
<< " GFlops\n"; + } + } + + VectorXf va = VectorXf::map(a, size2); + VectorXf vb = VectorXf::map(b, size2); + VectorXf vc = VectorXf::map(c, size2); + timer.reset(); + for (int k=0; k<3; ++k) + { + timer.start(); + benchVec(va, vb, vc); + timer.stop(); + } + std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n"; + + return 0; +} + +void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c) +{ + for (int k=0; k<REPEAT; ++k) + a = a + b; +} + +void benchVec(VectorXf& a, VectorXf& b, VectorXf& c) +{ + for (int k=0; k<REPEAT; ++k) + a = a + b; +} + +void benchVec(Scalar* a, Scalar* b, Scalar* c, int size) +{ + typedef ei_packet_traits<Scalar>::type PacketScalar; + const int PacketSize = ei_packet_traits<Scalar>::size; + PacketScalar a0, a1, a2, a3, b0, b1, b2, b3; + for (int k=0; k<REPEAT; ++k) + for (int i=0; i<size; i+=PacketSize*8) + { + a0 = ei_pload(&a[i]); + b0 = ei_pload(&b[i]); + a1 = ei_pload(&a[i+1*PacketSize]); + b1 = ei_pload(&b[i+1*PacketSize]); + a2 = ei_pload(&a[i+2*PacketSize]); + b2 = ei_pload(&b[i+2*PacketSize]); + a3 = ei_pload(&a[i+3*PacketSize]); + b3 = ei_pload(&b[i+3*PacketSize]); + ei_pstore(&a[i], ei_padd(a0, b0)); + a0 = ei_pload(&a[i+4*PacketSize]); + b0 = ei_pload(&b[i+4*PacketSize]); + + ei_pstore(&a[i+1*PacketSize], ei_padd(a1, b1)); + a1 = ei_pload(&a[i+5*PacketSize]); + b1 = ei_pload(&b[i+5*PacketSize]); + + ei_pstore(&a[i+2*PacketSize], ei_padd(a2, b2)); + a2 = ei_pload(&a[i+6*PacketSize]); + b2 = ei_pload(&b[i+6*PacketSize]); + + ei_pstore(&a[i+3*PacketSize], ei_padd(a3, b3)); + a3 = ei_pload(&a[i+7*PacketSize]); + b3 = ei_pload(&b[i+7*PacketSize]); + + ei_pstore(&a[i+4*PacketSize], ei_padd(a0, b0)); + ei_pstore(&a[i+5*PacketSize], ei_padd(a1, b1)); + ei_pstore(&a[i+6*PacketSize], ei_padd(a2, b2)); + ei_pstore(&a[i+7*PacketSize], ei_padd(a3, b3)); + +// ei_pstore(&a[i+2*PacketSize], ei_padd(ei_pload(&a[i+2*PacketSize]), ei_pload(&b[i+2*PacketSize]))); +// ei_pstore(&a[i+3*PacketSize], 
ei_padd(ei_pload(&a[i+3*PacketSize]), ei_pload(&b[i+3*PacketSize]))); +// ei_pstore(&a[i+4*PacketSize], ei_padd(ei_pload(&a[i+4*PacketSize]), ei_pload(&b[i+4*PacketSize]))); +// ei_pstore(&a[i+5*PacketSize], ei_padd(ei_pload(&a[i+5*PacketSize]), ei_pload(&b[i+5*PacketSize]))); +// ei_pstore(&a[i+6*PacketSize], ei_padd(ei_pload(&a[i+6*PacketSize]), ei_pload(&b[i+6*PacketSize]))); +// ei_pstore(&a[i+7*PacketSize], ei_padd(ei_pload(&a[i+7*PacketSize]), ei_pload(&b[i+7*PacketSize]))); + } +} |