aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--Eigen/Core1
-rw-r--r--Eigen/src/Core/EvalOMP.h118
-rw-r--r--Eigen/src/Core/ForwardDeclarations.h1
-rw-r--r--Eigen/src/Core/MatrixBase.h1
-rw-r--r--bench/BenchTimer.h75
-rw-r--r--bench/BenchUtil.h28
-rw-r--r--bench/README.txt55
-rw-r--r--bench/basicbench.cxxlist12
-rw-r--r--bench/basicbenchmark.cpp46
-rw-r--r--bench/basicbenchmark.h59
-rwxr-xr-xbench/bench_multi_compilers.sh28
-rwxr-xr-xbench/bench_unrolling (renamed from doc/bench_unrolling)0
-rw-r--r--bench/benchmark.cpp (renamed from doc/benchmark.cpp)0
-rw-r--r--bench/benchmarkX.cpp (renamed from doc/benchmarkX.cpp)2
-rwxr-xr-xbench/benchmark_suite (renamed from doc/benchmark_suite)0
-rw-r--r--bench/ompbench.cxxlist7
-rw-r--r--bench/ompbenchmark.cpp81
-rw-r--r--doc/Mainpage.dox2
18 files changed, 514 insertions, 2 deletions
diff --git a/Eigen/Core b/Eigen/Core
index e52ee96e9..f2b3639ca 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -40,6 +40,7 @@ namespace Eigen {
#include "src/Core/IO.h"
#include "src/Core/Swap.h"
#include "src/Core/CommaInitializer.h"
+#include "src/Core/EvalOMP.h"
} // namespace Eigen
diff --git a/Eigen/src/Core/EvalOMP.h b/Eigen/src/Core/EvalOMP.h
new file mode 100644
index 000000000..01f78b0c8
--- /dev/null
+++ b/Eigen/src/Core/EvalOMP.h
@@ -0,0 +1,118 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob@math.jussieu.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_EVAL_OMP_H
+#define EIGEN_EVAL_OMP_H
+
+/** \class EvalOMP
+ *
+ * \brief Parallel evaluation of an expression using OpenMP
+ *
+ * The template parameter Expression is the type of the expression that we are evaluating.
+ *
+ * This class is the return type of MatrixBase::evalOMP() and most of the time this is the
+ * only way it is used.
+ *
+ * Note that if OpenMP is not enabled, then this class is equivalent to Eval.
+ *
+ * \sa MatrixBase::evalOMP(), class Eval, MatrixBase::eval()
+ */
+template<typename ExpressionType> class EvalOMP : NoOperatorEquals,
+ public Matrix< typename ExpressionType::Scalar,
+ ExpressionType::Traits::RowsAtCompileTime,
+ ExpressionType::Traits::ColsAtCompileTime,
+ EIGEN_DEFAULT_MATRIX_STORAGE_ORDER,
+ ExpressionType::Traits::MaxRowsAtCompileTime,
+ ExpressionType::Traits::MaxColsAtCompileTime>
+{
+ public:
+ typedef typename ExpressionType::Scalar Scalar;
+
+ /** The actual matrix type to evaluate to. This type can be used independently
+ * of the rest of this class to get the actual matrix type to evaluate and store
+ * the value of an expression.
+ */
+ typedef Matrix<Scalar,
+ ExpressionType::Traits::RowsAtCompileTime,
+ ExpressionType::Traits::ColsAtCompileTime,
+ EIGEN_DEFAULT_MATRIX_STORAGE_ORDER,
+ ExpressionType::Traits::MaxRowsAtCompileTime,
+ ExpressionType::Traits::MaxColsAtCompileTime> MatrixType;
+
+ #ifdef _OPENMP
+ explicit EvalOMP(const ExpressionType& other)
+ : MatrixType(other.rows(), other.cols())
+ {
+ #ifdef __INTEL_COMPILER
+ #pragma omp parallel default(none) shared(other)
+ #else
+ #pragma omp parallel default(none)
+ #endif
+ {
+ if (this->cols()>this->rows())
+ {
+ #pragma omp for
+ for(int j = 0; j < this->cols(); j++)
+ for(int i = 0; i < this->rows(); i++)
+ this->coeffRef(i, j) = other.coeff(i, j);
+ }
+ else
+ {
+ #pragma omp for
+ for(int i = 0; i < this->rows(); i++)
+ for(int j = 0; j < this->cols(); j++)
+ this->coeffRef(i, j) = other.coeff(i, j);
+ }
+ }
+ }
+ #else
+ explicit EvalOMP(const ExpressionType& other) : MatrixType(other) {}
+ #endif
+};
+
+/** Evaluates *this in a parallel fashion using OpenMP and returns the obtained matrix.
+ *
+ * Of course, it only makes sense to call this function for complex expressions, and/or
+ * large matrices (>32x32), \b and if there is no outer loop which can be parallelized.
+ *
+ * It is the responsibility of the user manage the OpenMP parameters, for instance:
+ * \code
+ * #include <omp.h>
+ * // ...
+ * omp_set_num_threads(omp_get_num_procs());
+ * \endcode
+ * You also need to enable OpenMP on your compiler (e.g., -fopenmp) during both compilation and linking.
+ *
+ * Note that if OpenMP is not enabled, then evalOMP() is equivalent to eval().
+ *
+ * \sa class EvalOMP, eval()
+ */
+template<typename Scalar, typename Derived>
+const EvalOMP<Derived> MatrixBase<Scalar, Derived>::evalOMP() const
+{
+ return EvalOMP<Derived>(*static_cast<const Derived*>(this));
+}
+
+#endif // EIGEN_EVAL_OMP_H
diff --git a/Eigen/src/Core/ForwardDeclarations.h b/Eigen/src/Core/ForwardDeclarations.h
index aa9c4a052..dce88336e 100644
--- a/Eigen/src/Core/ForwardDeclarations.h
+++ b/Eigen/src/Core/ForwardDeclarations.h
@@ -44,6 +44,7 @@ template<typename MatrixType> class DiagonalCoeffs;
template<typename MatrixType> class Identity;
template<typename MatrixType> class Map;
template<typename Derived> class Eval;
+template<typename Derived> class EvalOMP;
struct ScalarProductOp;
struct ScalarQuotientOp;
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index b0b5a50a3..ccec292f6 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -359,6 +359,7 @@ template<typename Scalar, typename Derived> class MatrixBase
/// \name special functions
//@{
const Eval<Derived> eval() const EIGEN_ALWAYS_INLINE;
+ const EvalOMP<Derived> evalOMP() const EIGEN_ALWAYS_INLINE;
template<typename CustomUnaryOp>
const CwiseUnaryOp<CustomUnaryOp, Derived> cwise(const CustomUnaryOp& func = CustomUnaryOp()) const;
diff --git a/bench/BenchTimer.h b/bench/BenchTimer.h
new file mode 100644
index 000000000..e86c6ce13
--- /dev/null
+++ b/bench/BenchTimer.h
@@ -0,0 +1,75 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob@math.jussieu.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_BENCH_TIMER_H
+#define EIGEN_BENCH_TIMER_H
+
+#include <sys/time.h>
+#include <unistd.h>
+#include <cstdlib>
+
+namespace Eigen
+{
+
+/** Elapsed time timer keeping the best try.
+ */
+class BenchTimer
+{
+public:
+
+ BenchTimer() : m_best(1e99) {}
+
+ ~BenchTimer() {}
+
+ inline void start(void) {m_start = getTime();}
+ inline void stop(void)
+ {
+ m_best = std::min(m_best, getTime() - m_start);
+ }
+
+ /** Return the best elapsed time.
+ */
+ inline double value(void)
+ {
+ return m_best;
+ }
+
+ static inline double getTime(void)
+ {
+ struct timeval tv;
+ struct timezone tz;
+ gettimeofday(&tv, &tz);
+ return (double)tv.tv_sec + 1.e-6 * (double)tv.tv_usec;
+ }
+
+protected:
+
+ double m_best, m_start;
+
+};
+
+}
+
+#endif // EIGEN_BENCH_TIMER_H
diff --git a/bench/BenchUtil.h b/bench/BenchUtil.h
new file mode 100644
index 000000000..bb3c4611c
--- /dev/null
+++ b/bench/BenchUtil.h
@@ -0,0 +1,28 @@
+
+#include <Eigen/Core>
+#include "BenchTimer.h"
+
+using namespace std;
+USING_PART_OF_NAMESPACE_EIGEN
+
+#include <boost/preprocessor/repetition/enum_params.hpp>
+#include <boost/preprocessor/repetition.hpp>
+#include <boost/preprocessor/seq.hpp>
+#include <boost/preprocessor/array.hpp>
+#include <boost/preprocessor/arithmetic.hpp>
+#include <boost/preprocessor/comparison.hpp>
+#include <boost/preprocessor/punctuation.hpp>
+#include <boost/preprocessor/punctuation/comma.hpp>
+#include <boost/preprocessor/stringize.hpp>
+
+template<typename MatrixType> void initMatrix_random(MatrixType& mat) __attribute__((noinline));
+template<typename MatrixType> void initMatrix_random(MatrixType& mat)
+{
+ mat.setRandom();// = MatrixType::random(mat.rows(), mat.cols());
+}
+
+template<typename MatrixType> void initMatrix_identity(MatrixType& mat) __attribute__((noinline));
+template<typename MatrixType> void initMatrix_identity(MatrixType& mat)
+{
+ mat.setIdentity();
+}
diff --git a/bench/README.txt b/bench/README.txt
new file mode 100644
index 000000000..39831ae8a
--- /dev/null
+++ b/bench/README.txt
@@ -0,0 +1,55 @@
+
+This folder contains a couple of benchmark utities and Eigen benchmarks.
+
+****************************
+* bench_multi_compilers.sh *
+****************************
+
+This script allows to run a benchmark on a set of different compilers/compiler options.
+It takes two arguments:
+ - a file defining the list of the compilers with their options
+ - the .cpp file of the benchmark
+
+Examples:
+
+$ ./bench_multi_compilers.sh basicbench.cxxlist basicbenchmark.cpp
+
+ g++-4.1 -O3 -DNDEBUG -finline-limit=10000
+ 3d-3x3 / 4d-4x4 / Xd-4x4 / Xd-20x20 /
+ 0.271102 0.131416 0.422322 0.198633
+ 0.201658 0.102436 0.397566 0.207282
+
+ g++-4.2 -O3 -DNDEBUG -finline-limit=10000
+ 3d-3x3 / 4d-4x4 / Xd-4x4 / Xd-20x20 /
+ 0.107805 0.0890579 0.30265 0.161843
+ 0.127157 0.0712581 0.278341 0.191029
+
+ g++-4.3 -O3 -DNDEBUG -finline-limit=10000
+ 3d-3x3 / 4d-4x4 / Xd-4x4 / Xd-20x20 /
+ 0.134318 0.105291 0.3704 0.180966
+ 0.137703 0.0732472 0.31225 0.202204
+
+ icpc -fast -DNDEBUG -fno-exceptions -no-inline-max-size
+ 3d-3x3 / 4d-4x4 / Xd-4x4 / Xd-20x20 /
+ 0.226145 0.0941319 0.371873 0.159433
+ 0.109302 0.0837538 0.328102 0.173891
+
+
+$ ./bench_multi_compilers.sh ompbench.cxxlist ompbenchmark.cpp
+
+ g++-4.2 -O3 -DNDEBUG -finline-limit=10000 -fopenmp
+ double, fixed-size 4x4: 0.00165105s 0.0778739s
+ double, 32x32: 0.0654769s 0.075289s => x0.869674 (2)
+ double, 128x128: 0.054148s 0.0419669s => x1.29025 (2)
+ double, 512x512: 0.913799s 0.428533s => x2.13239 (2)
+ double, 1024x1024: 14.5972s 9.3542s => x1.5605 (2)
+
+ icpc -fast -DNDEBUG -fno-exceptions -no-inline-max-size -openmp
+ double, fixed-size 4x4: 0.000589848s 0.019949s
+ double, 32x32: 0.0682781s 0.0449722s => x1.51823 (2)
+ double, 128x128: 0.0547509s 0.0435519s => x1.25714 (2)
+ double, 512x512: 0.829436s 0.424438s => x1.9542 (2)
+ double, 1024x1024: 14.5243s 10.7735s => x1.34815 (2)
+
+
+
diff --git a/bench/basicbench.cxxlist b/bench/basicbench.cxxlist
new file mode 100644
index 000000000..93266aaf2
--- /dev/null
+++ b/bench/basicbench.cxxlist
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+CLIST[((g++))]="g++-4.1 -O3 -DNDEBUG"
+CLIST[((g++))]="g++-4.1 -O3 -DNDEBUG -finline-limit=20000"
+
+CLIST[((g++))]="g++-4.2 -O3 -DNDEBUG"
+CLIST[((g++))]="g++-4.2 -O3 -DNDEBUG -finline-limit=20000"
+
+CLIST[((g++))]="g++-4.3 -O3 -DNDEBUG"
+CLIST[((g++))]="g++-4.3 -O3 -DNDEBUG -finline-limit=20000"
+
+CLIST[((g++))]="icpc -fast -DNDEBUG -fno-exceptions -no-inline-max-size" \ No newline at end of file
diff --git a/bench/basicbenchmark.cpp b/bench/basicbenchmark.cpp
new file mode 100644
index 000000000..c44ed4514
--- /dev/null
+++ b/bench/basicbenchmark.cpp
@@ -0,0 +1,46 @@
+
+#include "BenchUtil.h"
+#include "basicbenchmark.h"
+
+int main(int argc, char *argv[])
+{
+ // disbale floating point exceptions
+ // this leads to more stable bench results
+ // (this is done by default by ICC)
+ #ifndef __INTEL_COMPILER
+ {
+ int aux;
+ asm(
+ "stmxcsr %[aux] \n\t"
+ "orl $32832, %[aux] \n\t"
+ "ldmxcsr %[aux] \n\t"
+ : : [aux] "m" (aux));
+ }
+ #endif
+
+ // this is the list of matrix type and size we want to bench:
+ // ((suffix) (matrix size) (number of iterations))
+ #define MODES ((3d)(3)(4000000)) ((4d)(4)(1000000)) ((Xd)(4)(1000000)) ((Xd)(20)(10000))
+// #define MODES ((Xd)(20)(10000))
+
+ #define _GENERATE_HEADER(R,ARG,EL) << BOOST_PP_STRINGIZE(BOOST_PP_SEQ_HEAD(EL)) << "-" \
+ << BOOST_PP_STRINGIZE(BOOST_PP_SEQ_ELEM(1,EL)) << "x" \
+ << BOOST_PP_STRINGIZE(BOOST_PP_SEQ_ELEM(1,EL)) << " / "
+
+ std::cout BOOST_PP_SEQ_FOR_EACH(_GENERATE_HEADER, ~, MODES ) << endl;
+
+ const int tries = 10;
+
+ #define _RUN_BENCH(R,ARG,EL) \
+ std::cout << ARG( \
+ BOOST_PP_CAT(Matrix, BOOST_PP_SEQ_HEAD(EL)) (\
+ BOOST_PP_SEQ_ELEM(1,EL),BOOST_PP_SEQ_ELEM(1,EL)), BOOST_PP_SEQ_ELEM(2,EL), tries) \
+ << " ";
+
+ BOOST_PP_SEQ_FOR_EACH(_RUN_BENCH, benchBasic<LazyEval>, MODES );
+ std::cout << endl;
+ BOOST_PP_SEQ_FOR_EACH(_RUN_BENCH, benchBasic<EarlyEval>, MODES );
+ std::cout << endl;
+
+ return 0;
+}
diff --git a/bench/basicbenchmark.h b/bench/basicbenchmark.h
new file mode 100644
index 000000000..60e1c0258
--- /dev/null
+++ b/bench/basicbenchmark.h
@@ -0,0 +1,59 @@
+
+enum {LazyEval, EarlyEval, OmpEval};
+
+template<int Mode, typename MatrixType>
+void benchBasic_loop(const MatrixType& I, MatrixType& m, int iterations) __attribute__((noinline));
+
+template<int Mode, typename MatrixType>
+void benchBasic_loop(const MatrixType& I, MatrixType& m, int iterations)
+{
+ for(int a = 0; a < iterations; a++)
+ {
+ if (Mode==LazyEval)
+ {
+ asm("#begin_bench_loop LazyEval");
+ if (MatrixType::Traits::SizeAtCompileTime!=Eigen::Dynamic) asm("#fixedsize");
+ m = (I + 0.00005 * (m + m.lazyProduct(m))).eval();
+ }
+ else if (Mode==OmpEval)
+ {
+ asm("#begin_bench_loop OmpEval");
+ if (MatrixType::Traits::SizeAtCompileTime!=Eigen::Dynamic) asm("#fixedsize");
+ m = (I + 0.00005 * (m + m.lazyProduct(m))).evalOMP();
+ }
+ else
+ {
+ asm("#begin_bench_loop EarlyEval");
+ if (MatrixType::Traits::SizeAtCompileTime!=Eigen::Dynamic) asm("#fixedsize");
+ m = I + 0.00005 * (m + m * m);
+ }
+ asm("#end_bench_loop");
+ }
+}
+
+template<int Mode, typename MatrixType>
+double benchBasic(const MatrixType& mat, int size, int tries) __attribute__((noinline));
+
+template<int Mode, typename MatrixType>
+double benchBasic(const MatrixType& mat, int iterations, int tries)
+{
+ const int rows = mat.rows();
+ const int cols = mat.cols();
+
+ MatrixType I(rows,cols);
+ MatrixType m(rows,cols);
+
+ initMatrix_identity(I);
+
+ Eigen::BenchTimer timer;
+ for(uint t=0; t<tries; ++t)
+ {
+ initMatrix_random(m);
+ timer.start();
+ benchBasic_loop<Mode>(I, m, iterations);
+ timer.stop();
+ cerr << m;
+ }
+ return timer.value();
+};
+
diff --git a/bench/bench_multi_compilers.sh b/bench/bench_multi_compilers.sh
new file mode 100755
index 000000000..ce5586fb9
--- /dev/null
+++ b/bench/bench_multi_compilers.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+if (($# < 2)); then
+ echo "Usage: $0 compilerlist.txt benchfile.cpp"
+else
+
+compilerlist=$1
+benchfile=$2
+
+g=0
+source $compilerlist
+
+# for each compiler, compile benchfile and run the benchmark
+for (( i=0 ; i<g ; ++i )) ; do
+ # check the compiler exists
+ compiler=`echo ${CLIST[$i]} | cut -d " " -f 1`
+ if [ -e `which $compiler` ]; then
+ echo "${CLIST[$i]}"
+# echo "${CLIST[$i]} $benchfile -I.. -o bench~"
+ if [ -e ./.bench ] ; then rm .bench; fi
+ ${CLIST[$i]} $benchfile -I.. -o .bench && ./.bench 2> /dev/null
+ echo ""
+ else
+ echo "compiler not found: $compiler"
+ fi
+done
+
+fi
diff --git a/doc/bench_unrolling b/bench/bench_unrolling
index 4af791412..4af791412 100755
--- a/doc/bench_unrolling
+++ b/bench/bench_unrolling
diff --git a/doc/benchmark.cpp b/bench/benchmark.cpp
index 0d95a5043..0d95a5043 100644
--- a/doc/benchmark.cpp
+++ b/bench/benchmark.cpp
diff --git a/doc/benchmarkX.cpp b/bench/benchmarkX.cpp
index 1c8b6b803..09173e1ed 100644
--- a/doc/benchmarkX.cpp
+++ b/bench/benchmarkX.cpp
@@ -13,7 +13,7 @@ int main(int argc, char *argv[])
{
m(i,j) = 0.1 * (i+20*j);
}
- for(int a = 0; a < 1000000; a++)
+ for(int a = 0; a < 100000; a++)
{
m = I + 0.00005 * (m + m*m);
}
diff --git a/doc/benchmark_suite b/bench/benchmark_suite
index 9ddfccbf6..9ddfccbf6 100755
--- a/doc/benchmark_suite
+++ b/bench/benchmark_suite
diff --git a/bench/ompbench.cxxlist b/bench/ompbench.cxxlist
new file mode 100644
index 000000000..fc6681d33
--- /dev/null
+++ b/bench/ompbench.cxxlist
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+CLIST[((g++))]="g++-4.2 -O3 -DNDEBUG -finline-limit=10000 -fopenmp"
+
+# CLIST[((g++))]="g++-4.3 -O3 -DNDEBUG -finline-limit=10000 -fopenmp"
+
+CLIST[((g++))]="icpc -fast -DNDEBUG -fno-exceptions -no-inline-max-size -openmp" \ No newline at end of file
diff --git a/bench/ompbenchmark.cpp b/bench/ompbenchmark.cpp
new file mode 100644
index 000000000..ac5155cb8
--- /dev/null
+++ b/bench/ompbenchmark.cpp
@@ -0,0 +1,81 @@
+// g++ -O3 -DNDEBUG -I.. -fopenmp benchOpenMP.cpp -o benchOpenMP && ./benchOpenMP 2> /dev/null
+// icpc -fast -fno-exceptions -DNDEBUG -I.. -openmp benchOpenMP.cpp -o benchOpenMP && ./benchOpenMP 2> /dev/null
+
+#include <omp.h>
+#include "BenchUtil.h"
+#include "basicbenchmark.h"
+
+// #include <Eigen/Core>
+// #include "BenchTimer.h"
+//
+// using namespace std;
+// USING_PART_OF_NAMESPACE_EIGEN
+//
+// enum {LazyEval, EarlyEval, OmpEval};
+//
+// template<int Mode, typename MatrixType>
+// double benchSingleProc(const MatrixType& mat, int iterations, int tries) __attribute__((noinline));
+//
+// template<int Mode, typename MatrixType>
+// double benchBasic(const MatrixType& mat, int iterations, int tries)
+// {
+// const int rows = mat.rows();
+// const int cols = mat.cols();
+//
+// Eigen::BenchTimer timer;
+// for(uint t=0; t<tries; ++t)
+// {
+// MatrixType I = MatrixType::identity(rows, cols);
+// MatrixType m = MatrixType::random(rows, cols);
+//
+// timer.start();
+// for(int a = 0; a < iterations; a++)
+// {
+// if(Mode==LazyEval)
+// m = (I + 0.00005 * (m + m.lazyProduct(m))).eval();
+// else if(Mode==OmpEval)
+// m = (I + 0.00005 * (m + m.lazyProduct(m))).evalOMP();
+// else
+// m = I + 0.00005 * (m + m * m);
+// }
+// timer.stop();
+// cerr << m;
+// }
+// return timer.value();
+// };
+
+int main(int argc, char *argv[])
+{
+ // disbale floating point exceptions
+ // this leads to more stable bench results
+ {
+ int aux;
+ asm(
+ "stmxcsr %[aux] \n\t"
+ "orl $32832, %[aux] \n\t"
+ "ldmxcsr %[aux] \n\t"
+ : : [aux] "m" (aux));
+ }
+
+ // commented since the default setting is use as many threads as processors
+ //omp_set_num_threads(omp_get_num_procs());
+
+ std::cout << "double, fixed-size 4x4: "
+ << benchBasic<LazyEval>(Matrix4d(), 10000, 10) << "s "
+ << benchBasic<OmpEval>(Matrix4d(), 10000, 10) << "s \n";
+
+ #define BENCH_MATRIX(TYPE, SIZE, ITERATIONS, TRIES) {\
+ double single = benchBasic<LazyEval>(Matrix<TYPE,Eigen::Dynamic,Eigen::Dynamic>(SIZE,SIZE), ITERATIONS, TRIES); \
+ double omp = benchBasic<OmpEval> (Matrix<TYPE,Eigen::Dynamic,Eigen::Dynamic>(SIZE,SIZE), ITERATIONS, TRIES); \
+ std::cout << #TYPE << ", " << #SIZE << "x" << #SIZE << ": " << single << "s " << omp << "s " \
+ << " => x" << single/omp << " (" << omp_get_num_procs() << ")" << std::endl; \
+ }
+
+ BENCH_MATRIX(double, 32, 1000, 10);
+ BENCH_MATRIX(double, 128, 10, 10);
+ BENCH_MATRIX(double, 512, 1, 6);
+ BENCH_MATRIX(double, 1024, 1, 4);
+
+ return 0;
+}
+
diff --git a/doc/Mainpage.dox b/doc/Mainpage.dox
index fea80519d..115cb2eeb 100644
--- a/doc/Mainpage.dox
+++ b/doc/Mainpage.dox
@@ -75,7 +75,7 @@ Eigen is well tested with recent versions of GCC and ICC. Both GCC 4.2 and ICC g
For best performance, we recommend the following compilation flags:
<ul>
- <li>\b GCC: \c -O3 \c -DNDEBUG \c -finline-limit=10000 \c -falign-functions=64 </li>
+ <li>\b GCC: \c -O3 \c -DNDEBUG \c -finline-limit=10000 </li>
<li>\b ICC: \c -O3 \c -DNDEBUG \c -xT \c -ipo \c -no-prec-div \c -no-inline-max-size </li>
</ul>