diff options
-rw-r--r-- | Eigen/Core | 7 | ||||
-rw-r--r-- | Eigen/src/Core/Assign.h | 37 | ||||
-rw-r--r-- | Eigen/src/Core/Product.h | 114 | ||||
-rw-r--r-- | Eigen/src/Core/util/Macros.h | 28 | ||||
-rw-r--r-- | Eigen/src/LU/Inverse.h | 1 | ||||
-rw-r--r-- | bench/benchmarkXcwise.cpp | 20 |
6 files changed, 73 insertions, 134 deletions
diff --git a/Eigen/Core b/Eigen/Core index 6a315b09f..24dc37145 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -10,13 +10,6 @@ #endif #endif -#ifndef EIGEN_DONT_PARALLELIZE -#ifdef _OPENMP -#define EIGEN_USE_OPENMP -#include <omp.h> -#endif -#endif - #include <cstdlib> #include <cmath> #include <complex> diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h index 1b6e928d2..d0f126689 100644 --- a/Eigen/src/Core/Assign.h +++ b/Eigen/src/Core/Assign.h @@ -135,11 +135,6 @@ Derived& MatrixBase<Derived> } } -template<typename T1, typename T2> bool ei_should_parallelize_assignment(const T1& t, const T2&) -{ - return (T1::Flags & T2::Flags & LargeBit) && t.size() >= EIGEN_PARALLELIZATION_TRESHOLD; -} - template <typename Derived, typename OtherDerived> struct ei_assignment_impl<Derived, OtherDerived, false> { @@ -158,23 +153,17 @@ struct ei_assignment_impl<Derived, OtherDerived, false> { if(Derived::ColsAtCompileTime == Dynamic || Derived::RowsAtCompileTime != Dynamic) { - #define EIGEN_THE_PARALLELIZABLE_LOOP \ - for(int j = 0; j < dst.cols(); j++) \ - for(int i = 0; i < dst.rows(); i++) \ - dst.coeffRef(i, j) = src.coeff(i, j); - EIGEN_RUN_PARALLELIZABLE_LOOP(ei_should_parallelize_assignment(dst, src)) - #undef EIGEN_THE_PARALLELIZABLE_LOOP + for(int j = 0; j < dst.cols(); j++) + for(int i = 0; i < dst.rows(); i++) + dst.coeffRef(i, j) = src.coeff(i, j); } else { // traverse in row-major order // in order to allow the compiler to unroll the inner loop - #define EIGEN_THE_PARALLELIZABLE_LOOP \ - for(int i = 0; i < dst.rows(); i++) \ - for(int j = 0; j < dst.cols(); j++) \ - dst.coeffRef(i, j) = src.coeff(i, j); - EIGEN_RUN_PARALLELIZABLE_LOOP(ei_should_parallelize_assignment(dst, src)) - #undef EIGEN_THE_PARALLELIZABLE_LOOP + for(int i = 0; i < dst.rows(); i++) + for(int j = 0; j < dst.cols(); j++) + dst.coeffRef(i, j) = src.coeff(i, j); } } } @@ -199,21 +188,15 @@ struct ei_assignment_impl<Derived, OtherDerived, true> { if(OtherDerived::Flags&RowMajorBit) { - #define EIGEN_THE_PARALLELIZABLE_LOOP \ - for(int i = 0; i < dst.rows(); i++) \ - for(int j = 0; j < dst.cols(); j+=ei_packet_traits<typename Derived::Scalar>::size) \ + for(int i = 0; i < dst.rows(); i++) + for(int j = 0; j < dst.cols(); j+=ei_packet_traits<typename Derived::Scalar>::size) dst.writePacketCoeff(i, j, src.packetCoeff(i, j)); - EIGEN_RUN_PARALLELIZABLE_LOOP(ei_should_parallelize_assignment(dst, src)) - #undef EIGEN_THE_PARALLELIZABLE_LOOP } else { - #define EIGEN_THE_PARALLELIZABLE_LOOP \ - for(int j = 0; j < dst.cols(); j++) \ - for(int i = 0; i < dst.rows(); i+=ei_packet_traits<typename Derived::Scalar>::size) \ + for(int j = 0; j < dst.cols(); j++) + for(int i = 0; i < dst.rows(); i+=ei_packet_traits<typename Derived::Scalar>::size) dst.writePacketCoeff(i, j, src.packetCoeff(i, j)); - EIGEN_RUN_PARALLELIZABLE_LOOP(ei_should_parallelize_assignment(dst, src)) - #undef EIGEN_THE_PARALLELIZABLE_LOOP } } } diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index b593825f8..a49609f5c 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -280,75 +280,67 @@ void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res) const { res.setZero(); const int cols4 = m_lhs.cols() & 0xfffffffC; - const bool should_parallelize = (Flags & DestDerived::Flags & LargeBit) - && res.size() >= EIGEN_PARALLELIZATION_TRESHOLD; #ifdef EIGEN_VECTORIZE if( (Flags & VectorizableBit) && (!(Lhs::Flags & RowMajorBit)) ) - { - #define EIGEN_THE_PARALLELIZABLE_LOOP \ - for(int k=0; k<this->cols(); k++) \ - { \ - int j=0; \ - for(; j<cols4; j+=4) \ - { \ - const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k)); \ - const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k)); \ - const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k)); \ - const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k)); \ - for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size) \ - { \ - res.writePacketCoeff(i,k,\ - ei_padd( \ - res.packetCoeff(i,k), \ - ei_padd( \ - ei_padd( \ - ei_pmul(tmp0, m_lhs.packetCoeff(i,j)), \ - ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))), \ - ei_padd( \ - ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)), \ - ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3)) \ - ) \ - ) \ - ) \ - ); \ - } \ - } \ - for(; j<m_lhs.cols(); ++j) \ - { \ - const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k)); \ - for (int i=0; i<this->rows(); ++i) \ - res.writePacketCoeff(i,k,ei_pmul(tmp, m_lhs.packetCoeff(i,j))); \ - } \ + { + for(int k=0; k<this->cols(); k++) + { + int j=0; + for(; j<cols4; j+=4) + { + const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k)); + const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k)); + const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k)); + const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k)); + for (int i=0; i<this->rows(); i+=ei_packet_traits<Scalar>::size) + { + res.writePacketCoeff(i,k,\ + ei_padd( + res.packetCoeff(i,k), + ei_padd( + ei_padd( + ei_pmul(tmp0, m_lhs.packetCoeff(i,j)), + ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))), + ei_padd( + ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)), + ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3)) + ) + ) + ) + ); + } + } + for(; j<m_lhs.cols(); ++j) + { + const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k)); + for (int i=0; i<this->rows(); ++i) + res.writePacketCoeff(i,k,ei_pmul(tmp, m_lhs.packetCoeff(i,j))); } - EIGEN_RUN_PARALLELIZABLE_LOOP(should_parallelize) - #undef EIGEN_THE_PARALLELIZABLE_LOOP + } } else #endif // EIGEN_VECTORIZE { - #define EIGEN_THE_PARALLELIZABLE_LOOP \ - for(int k=0; k<this->cols(); ++k) \ - { \ - int j=0; \ - for(; j<cols4; j+=4) \ - { \ - const Scalar tmp0 = m_rhs.coeff(j ,k); \ - const Scalar tmp1 = m_rhs.coeff(j+1,k); \ - const Scalar tmp2 = m_rhs.coeff(j+2,k); \ - const Scalar tmp3 = m_rhs.coeff(j+3,k); \ - for (int i=0; i<this->rows(); ++i) \ - res.coeffRef(i,k) += tmp0 * m_lhs.coeff(i,j) + tmp1 * m_lhs.coeff(i,j+1) \ - + tmp2 * m_lhs.coeff(i,j+2) + tmp3 * m_lhs.coeff(i,j+3); \ - } \ - for(; j<m_lhs.cols(); ++j) \ - { \ - const Scalar tmp = m_rhs.coeff(j,k); \ - for (int i=0; i<this->rows(); ++i) \ - res.coeffRef(i,k) += tmp * m_lhs.coeff(i,j); \ - } \ + for(int k=0; k<this->cols(); ++k) + { + int j=0; + for(; j<cols4; j+=4) + { + const Scalar tmp0 = m_rhs.coeff(j ,k); + const Scalar tmp1 = m_rhs.coeff(j+1,k); + const Scalar tmp2 = m_rhs.coeff(j+2,k); + const Scalar tmp3 = m_rhs.coeff(j+3,k); + for (int i=0; i<this->rows(); ++i) + res.coeffRef(i,k) += tmp0 * m_lhs.coeff(i,j) + tmp1 * m_lhs.coeff(i,j+1) + + tmp2 * m_lhs.coeff(i,j+2) + tmp3 * m_lhs.coeff(i,j+3); } - EIGEN_RUN_PARALLELIZABLE_LOOP(should_parallelize) - #undef EIGEN_THE_PARALLELIZABLE_LOOP + for(; j<m_lhs.cols(); ++j) + { + const Scalar tmp = m_rhs.coeff(j,k); + for (int i=0; i<this->rows(); ++i) + res.coeffRef(i,k) += tmp * m_lhs.coeff(i,j); + } + } } } diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index fad046766..be5e7bba5 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -37,10 +37,6 @@ #define EIGEN_UNROLLING_LIMIT 400 #endif -#ifndef EIGEN_PARALLELIZATION_TRESHOLD -#define EIGEN_PARALLELIZATION_TRESHOLD 2000 -#endif - #ifdef EIGEN_DEFAULT_TO_ROW_MAJOR #define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER RowMajorBit #else @@ -78,30 +74,6 @@ using Eigen::MatrixBase; #define EIGEN_ONLY_USED_FOR_DEBUG(x) #endif -#ifdef EIGEN_USE_OPENMP -# ifdef __INTEL_COMPILER -# define EIGEN_PRAGMA_OMP_PARALLEL _Pragma("omp parallel default(none) shared(other)") -# else -# define EIGEN_PRAGMA_OMP_PARALLEL _Pragma("omp parallel default(none)") -# endif -# define EIGEN_RUN_PARALLELIZABLE_LOOP(condition) \ - if(condition) \ - { \ - EIGEN_PRAGMA_OMP_PARALLEL \ - { \ - _Pragma("omp for") \ - EIGEN_THE_PARALLELIZABLE_LOOP \ - } \ - } \ - else \ - { \ - EIGEN_THE_PARALLELIZABLE_LOOP \ - } -#else // EIGEN_USE_OPENMP -# define EIGEN_RUN_PARALLELIZABLE_LOOP(condition) EIGEN_THE_PARALLELIZABLE_LOOP -#endif - - // FIXME with the always_inline attribute, // gcc 3.4.x reports the following compilation error: // Eval.h:91: sorry, unimplemented: inlining failed in call to 'const Eigen::Eval<Derived> Eigen::MatrixBase<Scalar, Derived>::eval() const' diff --git a/Eigen/src/LU/Inverse.h b/Eigen/src/LU/Inverse.h index 1d4bd9bf0..eda20e1f3 100644 --- a/Eigen/src/LU/Inverse.h +++ b/Eigen/src/LU/Inverse.h @@ -92,7 +92,6 @@ template<typename MatrixType, bool CheckExistence> class Inverse : ei_no_assignm enum { _Size = MatrixType::RowsAtCompileTime }; void _compute(const MatrixType& matrix); void _compute_in_general_case(const MatrixType& matrix); - void _compute_in_size1_case(const MatrixType& matrix); void _compute_in_size2_case(const MatrixType& matrix); void _compute_in_size3_case(const MatrixType& matrix); void _compute_in_size4_case(const MatrixType& matrix); diff --git a/bench/benchmarkXcwise.cpp b/bench/benchmarkXcwise.cpp index b2a7fc24c..9b394ff35 100644 --- a/bench/benchmarkXcwise.cpp +++ b/bench/benchmarkXcwise.cpp @@ -5,12 +5,12 @@ using namespace std; USING_PART_OF_NAMESPACE_EIGEN -#ifndef MATTYPE -#define MATTYPE MatrixXLd +#ifndef VECTYPE +#define VECTYPE VectorXLd #endif -#ifndef MATSIZE -#define MATSIZE 1000000 +#ifndef VECSIZE +#define VECSIZE 1000000 #endif #ifndef REPEAT @@ -19,16 +19,16 @@ USING_PART_OF_NAMESPACE_EIGEN int main(int argc, char *argv[]) { - MATTYPE I = MATTYPE::ones(MATSIZE,1); - MATTYPE m(MATSIZE,1); - for(int i = 0; i < MATSIZE; i++) for(int j = 0; j < 1; j++) + VECTYPE I = VECTYPE::ones(VECSIZE); + VECTYPE m(VECSIZE,1); + for(int i = 0; i < VECSIZE; i++) { - m(i,j) = 0.1 * (i+j+1)/MATSIZE/MATSIZE; + m[i] = 0.1 * i/VECSIZE; } for(int a = 0; a < REPEAT; a++) { - m = MATTYPE::ones(MATSIZE,1) + 0.00005 * (m.cwiseProduct(m) + m/4); + m = VECTYPE::ones(VECSIZE) + 0.00005 * (m.cwiseProduct(m) + m/4); } - cout << m(0,0) << endl; + cout << m[0] << endl; return 0; } |