diff options
author | Gael Guennebaud <g.gael@free.fr> | 2009-03-04 07:21:17 +0000 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2009-03-04 07:21:17 +0000 |
commit | 45136ac3b6ea63e9f8b093ab0e8dd0b9eac6452a (patch) | |
tree | 2ed6b7b6c76a8e8e2ef5466a72abc387554f135a /bench/btl | |
parent | 3288e9e168bf89737aded54c4d98470a865d6fd3 (diff) |
various updates of BTL
Diffstat (limited to 'bench/btl')
-rw-r--r-- | bench/btl/README | 8 | ||||
-rw-r--r-- | bench/btl/actions/basic_actions.hh | 2 | ||||
-rw-r--r-- | bench/btl/cmake/FindATLAS.cmake | 28 | ||||
-rw-r--r-- | bench/btl/cmake/FindGOTO.cmake | 4 | ||||
-rw-r--r-- | bench/btl/data/action_settings.txt | 26 | ||||
-rwxr-xr-x | bench/btl/data/go_mean | 40 | ||||
-rw-r--r-- | bench/btl/data/mk_mean_script.sh | 3 | ||||
-rw-r--r-- | bench/btl/generic_bench/bench_parameter.hh | 4 | ||||
-rw-r--r-- | bench/btl/generic_bench/btl.hh | 7 | ||||
-rw-r--r-- | bench/btl/generic_bench/timers/portable_perf_analyzer.hh | 9 | ||||
-rw-r--r-- | bench/btl/libs/C_BLAS/C_BLAS_interface.hh | 10 | ||||
-rw-r--r-- | bench/btl/libs/C_BLAS/main.cpp | 1 | ||||
-rw-r--r-- | bench/btl/libs/STL/STL_interface.hh | 9 | ||||
-rw-r--r-- | bench/btl/libs/STL/main.cpp | 1 | ||||
-rw-r--r-- | bench/btl/libs/eigen2/CMakeLists.txt | 2 | ||||
-rw-r--r-- | bench/btl/libs/eigen2/eigen2_interface.hh | 65 | ||||
-rw-r--r-- | bench/btl/libs/eigen2/main_vecmat.cpp | 2 | ||||
-rwxr-xr-x | bench/btl/libs/hand_vec/hand_vec_interface.hh | 131 | ||||
-rw-r--r-- | bench/btl/libs/hand_vec/main.cpp | 2 |
19 files changed, 294 insertions, 60 deletions
diff --git a/bench/btl/README b/bench/btl/README index 787002f9a..f3f5fb36f 100644 --- a/bench/btl/README +++ b/bench/btl/README @@ -43,10 +43,10 @@ Finally, if bench results already exist (the bench*.dat files) then they merges BTL_CONFIG="-a axpy:vector_matrix:trisolve:ata --overwrite" ctest -V -R eigen2 4 : Analyze the result. different data files (.dat) are produced in each libs directories. - If gnuplot is available, choose a directory name in the data directory to store the results and type - cd data - mkdir my_directory - cp ../libs/*/*.dat my_directory + If gnuplot is available, choose a directory name in the data directory to store the results and type: + $ cd data + $ mkdir my_directory + $ cp ../libs/*/*.dat my_directory Build the data utilities in this (data) directory make Then you can look the raw data, diff --git a/bench/btl/actions/basic_actions.hh b/bench/btl/actions/basic_actions.hh index 1e6e420f7..a23e58096 100644 --- a/bench/btl/actions/basic_actions.hh +++ b/bench/btl/actions/basic_actions.hh @@ -12,7 +12,7 @@ #include "action_trisolve.hh" #include "action_symv.hh" -#include "action_symm.hh" +// #include "action_symm.hh" #include "action_syr2.hh" // #include "action_lu_solve.hh" diff --git a/bench/btl/cmake/FindATLAS.cmake b/bench/btl/cmake/FindATLAS.cmake index bba350ba7..b4a984abe 100644 --- a/bench/btl/cmake/FindATLAS.cmake +++ b/bench/btl/cmake/FindATLAS.cmake @@ -15,23 +15,25 @@ find_path(ATLAS_INCLUDES find_file(ATLAS_LIB libatlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) find_library(ATLAS_LIB atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -# find_file(ATLAS_CBLAS libcblas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -# find_library(ATLAS_CBLAS cblas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) +find_file(ATLAS_CBLAS libcblas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) +find_library(ATLAS_CBLAS cblas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -# find_file(ATLAS_LAPACK liblapack_atlas.so.3 PATHS /usr/lib 
$ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -# find_library(ATLAS_LAPACK lapack_atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) +find_file(ATLAS_LAPACK liblapack_atlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) +find_library(ATLAS_LAPACK lapack_atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -# find_file(ATLAS_LAPACK liblapack.so.3 PATHS /usr/lib/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -# find_library(ATLAS_LAPACK lapack PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) +if(NOT ATLAS_LAPACK) + find_file(ATLAS_LAPACK liblapack.so.3 PATHS /usr/lib/atlas $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) + find_library(ATLAS_LAPACK lapack PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) +endif(NOT ATLAS_LAPACK) -# find_file(ATLAS_F77BLAS libf77blas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -# find_library(ATLAS_F77BLAS f77blas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) +find_file(ATLAS_F77BLAS libf77blas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) +find_library(ATLAS_F77BLAS f77blas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR}) -# if(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS) -set(ATLAS_LIBRARIES ${ATLAS_LIB} ${ATLAS_LAPACK} -# ${ATLAS_CBLAS} ${ATLAS_LAPACK} ${ATLAS_F77BLAS} -) -# endif(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS) +if(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS) + + set(ATLAS_LIBRARIES ${ATLAS_LAPACK} ${ATLAS_CBLAS} ${ATLAS_F77BLAS} ${ATLAS_LIB}) + +endif(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS) include(FindPackageHandleStandardArgs) find_package_handle_standard_args(ATLAS DEFAULT_MSG diff --git a/bench/btl/cmake/FindGOTO.cmake b/bench/btl/cmake/FindGOTO.cmake index b2b648b14..ad7eb3200 100644 --- a/bench/btl/cmake/FindGOTO.cmake +++ b/bench/btl/cmake/FindGOTO.cmake @@ -15,6 +15,10 @@ find_path(GOTO_INCLUDES find_file(GOTO_LIBRARIES libgotoblas.so PATHS /usr/lib $ENV{GOTODIR} ${LIB_INSTALL_DIR}) find_library(GOTO_LIBRARIES gotoblas PATHS $ENV{GOTODIR} ${LIB_INSTALL_DIR}) +if(GOTO_LIBRARIES 
AND CMAKE_COMPILER_IS_GNUCXX) + set(GOTO_LIBRARIES ${GOTO_LIBRARIES} "-lpthread") +endif(GOTO_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX) + include(FindPackageHandleStandardArgs) find_package_handle_standard_args(GOTO DEFAULT_MSG GOTO_INCLUDES GOTO_LIBRARIES) diff --git a/bench/btl/data/action_settings.txt b/bench/btl/data/action_settings.txt index 26557279b..5e88cee99 100644 --- a/bench/btl/data/action_settings.txt +++ b/bench/btl/data/action_settings.txt @@ -1,12 +1,14 @@ -aat ; "{/*1.5 A x A^T}" ; "matrix size" ; 4:1024 -ata ; "{/*1.5 A^T x A}" ; "matrix size" ; 4:1024 -atv ; "{/*1.5 matrix^T x vector}" ; "matrix size" ; 4:1024 -axpby ; "{/*1.5 Y = alpha * X + beta * Y}" ; "vector size" ; 5:1000000 -axpy ; "{/*1.5 Y += alpha * X}" ; "vector size" ; 5:1000000 -matrix_matrix ; "{/*1.5 matrix matrix product}" ; "matrix size" ; 4:1024 -matrix_vector ; "{/*1.5 matrix vector product}" ; "matrix size" ; 4:1024 -trisolve ; "{/*1.5 triangular solver (X = inv(L) * X)}" ; "size" ; 4:1024 -cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:1024 -lu_decomp ; "{/*1.5 LU decomposition}" ; "matrix size" ; 4:1024 -tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:1024 -hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:1024
\ No newline at end of file +aat ; "{/*1.5 A x A^T}" ; "matrix size" ; 4:2048 +ata ; "{/*1.5 A^T x A}" ; "matrix size" ; 4:2048 +atv ; "{/*1.5 matrix^T x vector}" ; "matrix size" ; 4:2048 +axpby ; "{/*1.5 Y = alpha X + beta Y}" ; "vector size" ; 5:1000000 +axpy ; "{/*1.5 Y += alpha X}" ; "vector size" ; 5:1000000 +matrix_matrix ; "{/*1.5 matrix matrix product}" ; "matrix size" ; 4:2048 +matrix_vector ; "{/*1.5 matrix vector product}" ; "matrix size" ; 4:2048 +trisolve ; "{/*1.5 triangular solver (X = inv(L) X)}" ; "size" ; 4:2048 +cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:2048 +lu_decomp ; "{/*1.5 LU decomposition}" ; "matrix size" ; 4:2048 +tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:2048 +hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:2048 +symv ; "{/*1.5 symmetric matrix vector product}" ; "matrix size" ; 4:2048 +syr2 ; "{/*1.5 symmetric rank-2 update (A += u^T v + u v^T)}" ; "matrix size" ; 4:2048
\ No newline at end of file diff --git a/bench/btl/data/go_mean b/bench/btl/data/go_mean index f8edf43db..71cca6126 100755 --- a/bench/btl/data/go_mean +++ b/bench/btl/data/go_mean @@ -1,7 +1,20 @@ #! /bin/bash + +if [ $# -lt 1 ]; then # numeric comparison: a bare '<' inside [ ] is parsed as an input redirection + echo "Usage: $0 working_directory [tiny|large [prefix]]" +else + mkdir -p $1 ##cp ../libs/*/*.dat $1 +mode=large +if [ $# -ge 2 ]; then # override the default mode only when a 2nd argument was given + mode=$2 +fi +if [ $# -ge 3 ]; then # optional 3rd argument: prefix forwarded to mk_mean_script.sh + prefix=$3 +fi + EIGENDIR=`cat eigen_root_dir.txt` webpagefilename=$1/index.html @@ -18,19 +31,22 @@ echo '<ul>'\ '</ul>' \ '</p>' >> $webpagefilename -source mk_mean_script.sh axpy $1 11 2500 100000 250000 $2 -source mk_mean_script.sh axpby $1 11 2500 100000 250000 $2 -source mk_mean_script.sh matrix_vector $1 11 50 300 1000 $2 -source mk_mean_script.sh atv $1 11 50 300 1000 $2 -source mk_mean_script.sh matrix_matrix $1 11 100 300 1000 $2 -source mk_mean_script.sh aat $1 11 100 300 1000 $2 -source mk_mean_script.sh ata $1 11 100 300 1000 $2 -source mk_mean_script.sh trisolve $1 11 100 300 1000 $2 -source mk_mean_script.sh cholesky $1 11 100 300 1000 $2 -source mk_mean_script.sh lu_decomp $1 11 100 300 1000 $2 -source mk_mean_script.sh tridiagonalization $1 11 100 300 1000 $2 -source mk_mean_script.sh hessenberg $1 11 100 300 1000 $2 +source mk_mean_script.sh axpy $1 11 2500 100000 250000 $mode $prefix +source mk_mean_script.sh axpby $1 11 2500 100000 250000 $mode $prefix +source mk_mean_script.sh matrix_vector $1 11 50 300 1000 $mode $prefix +source mk_mean_script.sh atv $1 11 50 300 1000 $mode $prefix +source mk_mean_script.sh matrix_matrix $1 11 100 300 1000 $mode $prefix +source mk_mean_script.sh aat $1 11 100 300 1000 $mode $prefix +source mk_mean_script.sh ata $1 11 100 300 1000 $mode $prefix +source mk_mean_script.sh trisolve $1 11 100 300 1000 $mode $prefix +source mk_mean_script.sh cholesky $1 11 100 300 1000 $mode $prefix +source mk_mean_script.sh lu_decomp $1 11 100 300 1000 $mode $prefix +source mk_mean_script.sh tridiagonalization $1 11 100 300 1000
$mode $prefix +source mk_mean_script.sh hessenberg $1 11 100 300 1000 $mode $prefix +source mk_mean_script.sh symv $1 11 50 300 1000 $mode $prefix +source mk_mean_script.sh syr2 $1 11 50 300 1000 $mode $prefix +fi ## compile the web page ## diff --git a/bench/btl/data/mk_mean_script.sh b/bench/btl/data/mk_mean_script.sh index baa0fd9df..43bab559a 100644 --- a/bench/btl/data/mk_mean_script.sh +++ b/bench/btl/data/mk_mean_script.sh @@ -5,6 +5,7 @@ MINIC=$3 MAXIC=$4 MINOC=$5 MAXOC=$6 +prefix=$8 meanstatsfilename=$2/mean.html @@ -37,7 +38,7 @@ echo '<br/>' >> $meanstatsfilename webpagefilename=$2/index.html # echo '<h3>'${WHAT}'</h3>' >> $webpagefilename -echo '<hr/><a href="/btl/'$1'.pdf"><img src="/btl/'$1'.png" alt="'${WHAT}'" /></a><br/>' >> $webpagefilename +echo '<hr/><a href="'$prefix$1'.pdf"><img src="'$prefix$1'.png" alt="'${WHAT}'" /></a><br/>' >> $webpagefilename diff --git a/bench/btl/generic_bench/bench_parameter.hh b/bench/btl/generic_bench/bench_parameter.hh index 1295b374d..08fea80e4 100644 --- a/bench/btl/generic_bench/bench_parameter.hh +++ b/bench/btl/generic_bench/bench_parameter.hh @@ -37,11 +37,11 @@ // min matrix size for matrix matrix product bench #define MIN_MM 5 // max matrix size for matrix matrix product bench -#define MAX_MM 2048 +#define MAX_MM MAX_MV // min matrix size for LU bench #define MIN_LU 5 // max matrix size for LU bench -#define MAX_LU 1024 +#define MAX_LU 2048 // max size for tiny vector and matrix #define TINY_MV_MAX_SIZE 16 // default nb_sample for x86 timer diff --git a/bench/btl/generic_bench/btl.hh b/bench/btl/generic_bench/btl.hh index 38e2c5f45..fdc099296 100644 --- a/bench/btl/generic_bench/btl.hh +++ b/bench/btl/generic_bench/btl.hh @@ -169,7 +169,7 @@ class BtlConfig { public: BtlConfig() - : overwriteResults(false) + : overwriteResults(false), checkResults(true) { char * _config; _config = getenv ("BTL_CONFIG"); @@ -193,6 +193,10 @@ public: { Instance.overwriteResults = true; } + else if 
(config[i].beginsWith("--nocheck")) + { + Instance.checkResults = false; + } } } @@ -214,6 +218,7 @@ public: static BtlConfig Instance; bool overwriteResults; + bool checkResults; protected: std::vector<BtlString> m_selectedActionNames; diff --git a/bench/btl/generic_bench/timers/portable_perf_analyzer.hh b/bench/btl/generic_bench/timers/portable_perf_analyzer.hh index d0fe95ce0..67d3378fc 100644 --- a/bench/btl/generic_bench/timers/portable_perf_analyzer.hh +++ b/bench/btl/generic_bench/timers/portable_perf_analyzer.hh @@ -65,9 +65,12 @@ public: time_action = time_action / (double(_nb_calc)); // check - action.initialize(); - action.calculate(); - action.check_result(); + if (BtlConfig::Instance.checkResults) + { + action.initialize(); + action.calculate(); + action.check_result(); + } return action.nb_op_base()/(time_action*1000000.0); } diff --git a/bench/btl/libs/C_BLAS/C_BLAS_interface.hh b/bench/btl/libs/C_BLAS/C_BLAS_interface.hh index 319658c6b..a726fa89d 100644 --- a/bench/btl/libs/C_BLAS/C_BLAS_interface.hh +++ b/bench/btl/libs/C_BLAS/C_BLAS_interface.hh @@ -132,7 +132,7 @@ static char notrans = 'N'; static char trans = 'T'; static char nonunit = 'N'; static char lower = 'L'; -static blasint intone = 1; +static int intone = 1; template<> class C_BLAS_interface<float> : public f77_interface_base<float> @@ -160,6 +160,14 @@ public : cblas_ssymv(CblasColMajor,CblasLower,N,1.0,A,N,B,1,0.0,X,1); #endif } + + static inline void syr2(gene_matrix & A, gene_vector & B, gene_vector & X, int N){ + #ifdef PUREBLAS + ssyr2_(&lower,&N,&fone,B,&intone,X,&intone,A,&N); + #else + cblas_ssyr2(CblasColMajor,CblasLower,N,1.0,B,1,X,1,A,N); + #endif + } static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){ #ifdef PUREBLAS diff --git a/bench/btl/libs/C_BLAS/main.cpp b/bench/btl/libs/C_BLAS/main.cpp index 1eee55077..57cb9930e 100644 --- a/bench/btl/libs/C_BLAS/main.cpp +++ b/bench/btl/libs/C_BLAS/main.cpp @@ -41,6 +41,7 @@ int main() 
bench<Action_matrix_vector_product<C_BLAS_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); bench<Action_atv_product<C_BLAS_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); bench<Action_symv<C_BLAS_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); + bench<Action_syr2<C_BLAS_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); bench<Action_matrix_matrix_product<C_BLAS_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); bench<Action_ata_product<C_BLAS_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); diff --git a/bench/btl/libs/STL/STL_interface.hh b/bench/btl/libs/STL/STL_interface.hh index 9dd9b8ed4..3958d4af5 100644 --- a/bench/btl/libs/STL/STL_interface.hh +++ b/bench/btl/libs/STL/STL_interface.hh @@ -146,6 +146,15 @@ public : X[j] += t2; } } + + static inline void syr2(gene_matrix & A, gene_vector & B, gene_vector & X, int N) + { + for (int j=0; j<N; ++j) + { + for (int i=j; i<N; ++i) + A[j][i] += B[i]*X[j] + B[j]*X[i]; + } + } static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N) { diff --git a/bench/btl/libs/STL/main.cpp b/bench/btl/libs/STL/main.cpp index 619504e47..4e73328ef 100644 --- a/bench/btl/libs/STL/main.cpp +++ b/bench/btl/libs/STL/main.cpp @@ -31,6 +31,7 @@ int main() bench<Action_matrix_vector_product<STL_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); bench<Action_atv_product<STL_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); bench<Action_symv<STL_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); + bench<Action_syr2<STL_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); bench<Action_matrix_matrix_product<STL_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); bench<Action_ata_product<STL_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); bench<Action_aat_product<STL_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); diff --git a/bench/btl/libs/eigen2/CMakeLists.txt b/bench/btl/libs/eigen2/CMakeLists.txt index f061a27de..beb37f79b 100644 --- a/bench/btl/libs/eigen2/CMakeLists.txt +++ 
b/bench/btl/libs/eigen2/CMakeLists.txt @@ -7,7 +7,7 @@ if (EIGEN2_FOUND) btl_add_bench(btl_eigen2_vecmat main_vecmat.cpp) btl_add_bench(btl_eigen2_matmat main_matmat.cpp) btl_add_bench(btl_eigen2_adv main_adv.cpp) - + IF(NOT BTL_NOVEC) btl_add_bench(btl_eigen2_novec_linear main_linear.cpp) btl_add_bench(btl_eigen2_novec_vecmat main_vecmat.cpp) diff --git a/bench/btl/libs/eigen2/eigen2_interface.hh b/bench/btl/libs/eigen2/eigen2_interface.hh index 2b463f017..92a5677d3 100644 --- a/bench/btl/libs/eigen2/eigen2_interface.hh +++ b/bench/btl/libs/eigen2/eigen2_interface.hh @@ -18,7 +18,7 @@ #ifndef EIGEN2_INTERFACE_HH #define EIGEN2_INTERFACE_HH // #include <cblas.h> -#include <Eigen/Core> +#include <Eigen/Array> #include <Eigen/Cholesky> #include <Eigen/LU> #include <Eigen/QR> @@ -45,7 +45,9 @@ public : static inline std::string name( void ) { - #if defined(EIGEN_VECTORIZE_SSE) + #if defined(EIGEN_USE_NEW_PRODUCT) + if (SIZE==Dynamic) return "eigen2_newprod"; else return "tiny_eigen2"; + #elif defined(EIGEN_VECTORIZE_SSE) if (SIZE==Dynamic) return "eigen2"; else return "tiny_eigen2"; #elif defined(EIGEN_VECTORIZE_ALTIVEC) if (SIZE==Dynamic) return "eigen2"; else return "tiny_eigen2"; @@ -114,7 +116,57 @@ public : } static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){ - X = (A.template marked<SelfAdjoint|LowerTriangular>() * B)/*.lazy()*/; + //X = (A.template marked<SelfAdjoint|LowerTriangular>() * B)/*.lazy()*/; + ei_product_selfadjoint_vector<real,0,LowerTriangularBit>(N,A.data(),N, B.data(), X.data()); + } + + template<typename Dest, typename Src> static void triassign(Dest& dst, const Src& src) + { + typedef typename Dest::Scalar Scalar; + typedef typename ei_packet_traits<Scalar>::type Packet; + const int PacketSize = sizeof(Packet)/sizeof(Scalar); + int size = dst.cols(); + for(int j=0; j<size; j+=1) + { +// const int alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask); + Scalar* A0 = dst.data() + 
j*dst.stride(); + int starti = j; + int alignedEnd = starti; + int alignedStart = (starti) + ei_alignmentOffset(&A0[starti], size-starti); + alignedEnd = alignedStart + ((size-alignedStart)/(2*PacketSize))*(PacketSize*2); + + // do the non-vectorizable part of the assignment + for (int index = starti; index<alignedStart ; ++index) + { + if(Dest::Flags&RowMajorBit) + dst.copyCoeff(j, index, src); + else + dst.copyCoeff(index, j, src); + } + + // do the vectorizable part of the assignment + for (int index = alignedStart; index<alignedEnd; index+=PacketSize) + { + if(Dest::Flags&RowMajorBit) + dst.template copyPacket<Src, Aligned, Unaligned>(j, index, src); + else + dst.template copyPacket<Src, Aligned, Unaligned>(index, j, src); + } + + // do the non-vectorizable part of the assignment + for (int index = alignedEnd; index<size; ++index) + { + if(Dest::Flags&RowMajorBit) + dst.copyCoeff(j, index, src); + else + dst.copyCoeff(index, j, src); + } + //dst.col(j).end(N-j) = src.col(j).end(N-j); + } + } + + static EIGEN_DONT_INLINE void syr2(gene_matrix & A, gene_vector & X, gene_vector & Y, int N){ + // ei_product_selfadjoint_rank2_update<real,0,LowerTriangularBit>(N,A.data(),N, X.data(), 1, Y.data(), 1, -1); } static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){ @@ -126,7 +178,9 @@ public : } static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int N){ + asm("#begin axpby"); Y = a*X + b*Y; + asm("#end axpby"); } static inline void copy_matrix(const gene_matrix & source, gene_matrix & cible, int N){ @@ -158,7 +212,10 @@ public : } static inline void tridiagonalization(const gene_matrix & X, gene_matrix & C, int N){ - C = Tridiagonalization<gene_matrix>(X).packedMatrix(); + typename Tridiagonalization<gene_matrix>::CoeffVectorType aux(N-1); + C = X; + Tridiagonalization<gene_matrix>::_compute(C, aux); +// C = Tridiagonalization<gene_matrix>(X).packedMatrix(); } static inline void hessenberg(const gene_matrix 
& X, gene_matrix & C, int N){ diff --git a/bench/btl/libs/eigen2/main_vecmat.cpp b/bench/btl/libs/eigen2/main_vecmat.cpp index 881d90e2a..fb00d6f79 100644 --- a/bench/btl/libs/eigen2/main_vecmat.cpp +++ b/bench/btl/libs/eigen2/main_vecmat.cpp @@ -19,7 +19,6 @@ #include "eigen2_interface.hh" #include "bench.hh" #include "basic_actions.hh" -#include "action_symv.hh" BTL_MAIN; @@ -28,6 +27,7 @@ int main() bench<Action_matrix_vector_product<eigen2_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); bench<Action_atv_product<eigen2_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); bench<Action_symv<eigen2_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); + bench<Action_syr2<eigen2_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); return 0; } diff --git a/bench/btl/libs/hand_vec/hand_vec_interface.hh b/bench/btl/libs/hand_vec/hand_vec_interface.hh index 4e7d549ce..6080b2460 100755 --- a/bench/btl/libs/hand_vec/hand_vec_interface.hh +++ b/bench/btl/libs/hand_vec/hand_vec_interface.hh @@ -38,16 +38,16 @@ public : typedef typename f77_interface_base<real>::gene_vector gene_vector; static void free_matrix(gene_matrix & A, int N){ - ei_aligned_delete(A); + ei_aligned_free(A); } static void free_vector(gene_vector & B){ - ei_aligned_delete(B); + ei_aligned_free(B); } static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ int N = A_stl.size(); - A = ei_aligned_new<real>(N*N); + A = (real*)ei_aligned_malloc(N*N*sizeof(real)); for (int j=0;j<N;j++) for (int i=0;i<N;i++) A[i+N*j] = A_stl[j][i]; @@ -55,7 +55,7 @@ public : static inline void vector_from_stl(gene_vector & B, stl_vector & B_stl){ int N = B_stl.size(); - B = ei_aligned_new<real>(N); + B = (real*)ei_aligned_malloc(N*sizeof(real)); for (int i=0;i<N;i++) B[i] = B_stl[i]; } @@ -236,6 +236,129 @@ public : } asm("#end matrix_vector_product"); } + + static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) + { + +// int AN = (N/PacketSize)*PacketSize; +// int ANP = 
(AN/(2*PacketSize))*2*PacketSize; +// int bound = (N/4)*4; + for (int i=0;i<N;i++) + X[i] = 0; + + int bound = std::max(0,N-8) & 0xfffffffE; + + for (int j=0;j<bound;j+=2) + { + register real* __restrict__ A0 = A + j*N; + register real* __restrict__ A1 = A + (j+1)*N; + + real t0 = B[j]; + Packet ptmp0 = ei_pset1(t0); + real t1 = B[j+1]; + Packet ptmp1 = ei_pset1(t1); + + real t2 = 0; + Packet ptmp2 = ei_pset1(t2); + real t3 = 0; + Packet ptmp3 = ei_pset1(t3); + + int starti = j+2; + int alignedEnd = starti; + int alignedStart = (starti) + ei_alignmentOffset(&X[starti], N-starti); + alignedEnd = alignedStart + ((N-alignedStart)/(PacketSize))*(PacketSize); + + X[j] += t0 * A0[j]; + X[j+1] += t1 * A1[j+1]; /* diagonal entry of column j+1; A1[j] would read above the diagonal */ + + X[j+1] += t0 * A0[j+1]; + t2 += A0[j+1] * B[j+1]; + +// alignedStart = alignedEnd; + for (int i=starti; i<alignedStart; ++i) { + X[i] += t0 * A0[i] + t1 * A1[i]; + t2 += A0[i] * B[i]; + t3 += A1[i] * B[i]; + } + asm("#begin symv"); + for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize) { + Packet A0i = ei_ploadu(&A0[i]); + Packet A1i = ei_ploadu(&A1[i]); +// Packet A0i1 = ei_ploadu(&A0[i+PacketSize]); + Packet Xi = ei_pload(&X[i]); + Packet Bi = ei_pload/*u*/(&B[i]); +// Packet Xi1 = ei_pload(&X[i+PacketSize]); +// Packet Bi1 = ei_pload/*u*/(&B[i+PacketSize]); + Xi = ei_padd(ei_padd(Xi, ei_pmul(ptmp0, A0i)), ei_pmul(ptmp1, A1i)); + ptmp2 = ei_padd(ptmp2, ei_pmul(A0i, Bi)); + ptmp3 = ei_padd(ptmp3, ei_pmul(A1i, Bi)); +// Xi1 = ei_padd(Xi1, ei_pmul(ptmp1, A0i1)); +// ptmp2 = ei_padd(ptmp2, ei_pmul(A0i1, Bi1)); +// + ei_pstore(&X[i],Xi); +// ei_pstore(&X[i+PacketSize],Xi1); +// asm( +// "prefetchnta 64(%[A0],%[i],4) \n\t" +// //"movups (%[A0],%[i],4), %%xmm8 \n\t" +// "movsd (%[A0],%[i],4), %%xmm8 \n\t" +// "movhps 8(%[A0],%[i],4), %%xmm8 \n\t" +// // "movups 16(%[A0],%[i],4), %%xmm9 \n\t" +// // "movups 64(%[A0],%[i],4), %%xmm15 \n\t" +// "movaps (%[B], %[i],4), %%xmm12 \n\t" +// // "movaps 16(%[B], %[i],4), %%xmm13 \n\t" +// "movaps (%[X], %[i],4), %%xmm10
\n\t" +// // "movaps 16(%[X], %[i],4), %%xmm11 \n\t" +// +// "mulps %%xmm8, %%xmm12 \n\t" +// // "mulps %%xmm9, %%xmm13 \n\t" +// +// "mulps %[ptmp1], %%xmm8 \n\t" +// "addps %%xmm12, %[ptmp2] \n\t" +// "addps %%xmm8, %%xmm10 \n\t" +// +// +// +// +// // "mulps %[ptmp1], %%xmm9 \n\t" +// +// // "addps %%xmm9, %%xmm11 \n\t" +// // "addps %%xmm13, %[ptmp2] \n\t" +// +// "movaps %%xmm10, (%[X],%[i],4) \n\t" +// // "movaps %%xmm11, 16(%[X],%[i],4) \n\t" +// : +// : [X] "r" (X), [i] "r" (i), [A0] "r" (A0), +// [B] "r" (B), +// [ptmp1] "x" (ptmp1), +// [ptmp2] "x" (ptmp2) +// : "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm15"); + } + asm("#end symv"); + for (int i=alignedEnd; i<N; i++) { + X[i] += t0 * A0[i] + t1 * A1[i]; + t2 += A0[i] * B[i]; + t3 += A1[i] * B[i]; + } + + + X[j] += t2 + ei_predux(ptmp2); + X[j+1] += t3 + ei_predux(ptmp3); + } + for (int j=bound;j<N;j++) + { + register real* __restrict__ A0 = A + j*N; + + real t1 = B[j]; + real t2 = 0; + X[j] += t1 * A0[j]; + for (int i=j+1; i<N; ++i) { /* scalar remainder: step by one so every row below the diagonal is accumulated */ + X[i] += t1 * A0[i]; + t2 += A0[i] * B[i]; + } + X[j] += t2; + } + + } // static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) // { diff --git a/bench/btl/libs/hand_vec/main.cpp b/bench/btl/libs/hand_vec/main.cpp index eea8cb88b..8bde4c89b 100644 --- a/bench/btl/libs/hand_vec/main.cpp +++ b/bench/btl/libs/hand_vec/main.cpp @@ -26,6 +26,7 @@ #include "action_axpy.hh" #include "action_ata_product.hh" #include "action_aat_product.hh" +#include "basic_actions.hh" //#include "action_lu_solve.hh" // #include "timers/mixed_perf_analyzer.hh" @@ -39,6 +40,7 @@ int main() // bench<Action_matrix_matrix_product<hand_vec_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); // bench<Action_aat_product<hand_vec_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); // bench<Action_ata_product<hand_vec_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); + bench<Action_symv<hand_vec_interface<REAL_TYPE> >
>(MIN_MM,MAX_MM,NB_POINT); bench<Action_axpy<hand_vec_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT); |