diff options
author | Gael Guennebaud <g.gael@free.fr> | 2011-03-23 10:07:24 +0100 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2011-03-23 10:07:24 +0100 |
commit | e35b1ef3f39bac2fc4391a8326b50602425fb588 (patch) | |
tree | f7598761a1506ea8be009532a3dc92b1bc676611 | |
parent | fe595e91ae9a2d8544cae167775d7a076fcaf27c (diff) |
BTL: rm stupid backends
27 files changed, 7 insertions, 1882 deletions
diff --git a/bench/btl/CMakeLists.txt b/bench/btl/CMakeLists.txt index 95ffbf42a..2159415f0 100644 --- a/bench/btl/CMakeLists.txt +++ b/bench/btl/CMakeLists.txt @@ -90,17 +90,13 @@ endmacro(btl_add_target_property) ENABLE_TESTING() add_subdirectory(libs/eigen3) -# add_subdirectory(libs/hand_vec) +add_subdirectory(libs/C_BLAS) +add_subdirectory(libs/ublas) add_subdirectory(libs/gmm) add_subdirectory(libs/mtl4) -add_subdirectory(libs/ublas) add_subdirectory(libs/blitz) add_subdirectory(libs/tvmet) -add_subdirectory(libs/C_BLAS) -add_subdirectory(libs/f77) -add_subdirectory(libs/C) add_subdirectory(libs/STL) -add_subdirectory(libs/STL_algo) add_subdirectory(data) diff --git a/bench/btl/actions/action_hessenberg.hh b/bench/btl/actions/action_hessenberg.hh index a46964657..5dd12a9bc 100644 --- a/bench/btl/actions/action_hessenberg.hh +++ b/bench/btl/actions/action_hessenberg.hh @@ -136,13 +136,13 @@ public : Action_tridiagonalization( int size ):_size(size) { - MESSAGE("Action_hessenberg Ctor"); + MESSAGE("Action_tridiagonalization Ctor"); // STL vector initialization typename Interface::stl_matrix tmp; init_matrix<pseudo_random>(tmp,_size); init_matrix<null_function>(X_stl,_size); - STL_interface<typename Interface::real_type>::ata_product(tmp,X_stl,_size); + STL_interface<typename Interface::real_type>::aat_product(tmp,X_stl,_size); init_matrix<null_function>(C_stl,_size); init_matrix<null_function>(resu_stl,_size); @@ -155,9 +155,9 @@ public : _cost = 0; for (int j=0; j<_size-2; ++j) { - int r = std::max(0,_size-j-1); - int b = std::max(0,_size-j-2); - _cost += 6 + 3*b + r*r*8; + double r = std::max(0,_size-j-1); + double b = std::max(0,_size-j-2); + _cost += 6. + 3.*b + r*r*8.; } } diff --git a/bench/btl/libs/C/CMakeLists.txt b/bench/btl/libs/C/CMakeLists.txt deleted file mode 100644 index 3d4d24cee..000000000 --- a/bench/btl/libs/C/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -include_directories(${PROJECT_SOURCE_DIR}/libs/f77) -btl_add_bench(btl_C main.cpp OFF) -# set_target_properties(btl_C PROPERTIES COMPILE_FLAGS "-fpeel-loops")
\ No newline at end of file diff --git a/bench/btl/libs/C/C_interface.hh b/bench/btl/libs/C/C_interface.hh deleted file mode 100755 index d6092517d..000000000 --- a/bench/btl/libs/C/C_interface.hh +++ /dev/null @@ -1,117 +0,0 @@ -//===================================================== -// File : C_interface.hh -// Author : L. Plagne <laurent.plagne@edf.fr)> -// Copyright (C) EDF R&D, lun sep 30 14:23:23 CEST 2002 -//===================================================== -// -// This program is free software; you can redistribute it and/or -// modify it under the terms of the GNU General Public License -// as published by the Free Software Foundation; either version 2 -// of the License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -// -#ifndef C_INTERFACE_HH -#define C_INTERFACE_HH - -#include "f77_interface.hh" - -template<class real> -class C_interface : public f77_interface_base<real> { - -public : - - typedef typename f77_interface_base<real>::gene_matrix gene_matrix; - typedef typename f77_interface_base<real>::gene_vector gene_vector; - - static inline std::string name() { return "C"; } - - static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) - { -// for (int i=0;i<N;i++) -// { -// real somme = 0.0; -// for (int j=0;j<N;j++) -// somme += A[j*N+i] * B[j]; -// X[i] = somme; -// } - for (int i=0;i<N;i++) - X[i] = 0; - for (int i=0;i<N;i++) - { - real tmp = B[i]; - int iN = i*N; - for (int j=0;j<N;j++) - X[j] += tmp * A[j+iN]; - } - } - - static inline void atv_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) - { - for (int i=0;i<N;i++) - { - int iN = i*N; - real somme = 0.0; - for (int j=0;j<N;j++) - somme += A[iN+j] * B[j]; - X[i] = somme; - } - } - - static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N) - { - real somme; - for (int i=0;i<N;i++){ - for (int j=0;j<N;j++){ - somme=0.0; - for (int k=0;k<N;k++){ - somme += A[i+k*N] * B[k+j*N]; - } - X[i+j*N] = somme; - } - } - } - - static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N) - { - - real somme; - for (int i=0;i<N;i++){ - for (int j=0;j<N;j++){ - somme=0.0; - for (int k=0;k<N;k++){ - somme+=A[k+i*N]*A[k+j*N]; - } - X[i+j*N]=somme; - } - } - } - - static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N){ - real somme; - for (int i=0;i<N;i++){ - for (int j=0;j<N;j++){ - somme=0.0; - for (int k=0;k<N;k++){ - somme+=A[i+k*N]*A[j+k*N]; - } - X[i+j*N] = somme; - } - } - } - - static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int N){ - for (int i=0;i<N;i++) - Y[i]+=coef*X[i]; - } - - -}; - -#endif diff --git a/bench/btl/libs/C/main.cpp b/bench/btl/libs/C/main.cpp deleted file mode 100644 index f0a0e5c65..000000000 --- a/bench/btl/libs/C/main.cpp +++ /dev/null @@ -1,48 +0,0 @@ -//===================================================== -// File : main.cpp -// Author : L. Plagne <laurent.plagne@edf.fr)> -// Copyright (C) EDF R&D, lun sep 30 14:23:23 CEST 2002 -//===================================================== -// -// This program is free software; you can redistribute it and/or -// modify it under the terms of the GNU General Public License -// as published by the Free Software Foundation; either version 2 -// of the License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -// -#include "utilities.h" -#include "bench.hh" -#include "C_interface.hh" -#include "action_matrix_vector_product.hh" -#include "action_atv_product.hh" -#include "action_matrix_matrix_product.hh" -#include "action_axpy.hh" -#include "action_ata_product.hh" -#include "action_aat_product.hh" -//#include "action_lu_solve.hh" -#include "timers/mixed_perf_analyzer.hh" - -BTL_MAIN; - -int main() -{ - - bench<Action_matrix_vector_product<C_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); - bench<Action_atv_product<C_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); - bench<Action_matrix_matrix_product<C_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); - bench<Action_aat_product<C_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); - bench<Action_ata_product<C_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); - bench<Action_axpy<C_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT); - - - return 0; -} - - diff --git a/bench/btl/libs/STL_algo/CMakeLists.txt b/bench/btl/libs/STL_algo/CMakeLists.txt deleted file mode 100644 index 8309c8bdd..000000000 --- a/bench/btl/libs/STL_algo/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ - -btl_add_bench(btl_STL_algo main.cpp OFF) diff --git a/bench/btl/libs/STL_algo/STL_algo_interface.hh b/bench/btl/libs/STL_algo/STL_algo_interface.hh deleted file mode 100644 index 8854479a6..000000000 --- a/bench/btl/libs/STL_algo/STL_algo_interface.hh +++ /dev/null @@ -1,138 +0,0 @@ -//===================================================== -// File : STL_algo_interface.hh -// Author : L. Plagne <laurent.plagne@edf.fr)> -// Copyright (C) EDF R&D, lun sep 30 14:23:24 CEST 2002 -//===================================================== -// -// This program is free software; you can redistribute it and/or -// modify it under the terms of the GNU General Public License -// as published by the Free Software Foundation; either version 2 -// of the License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -// -#ifndef STL_ALGO_INTERFACE_HH -#define STL_ALGO_INTERFACE_HH -#include <string> -#include <vector> -#include <numeric> -#include <algorithm> -#include "utilities.h" - - -template<class real> -class STL_algo_interface{ - -public : - - typedef real real_type ; - - typedef std::vector<real> stl_vector; - typedef std::vector<stl_vector > stl_matrix; - - typedef stl_matrix gene_matrix; - - typedef stl_vector gene_vector; - - static inline std::string name( void ) - { - return "STL_algo"; - } - - static void free_matrix(gene_matrix & A, int N){} - - static void free_vector(gene_vector & B){} - - static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ - A=A_stl ; - } - - static inline void vector_from_stl(gene_vector & B, stl_vector & B_stl){ - B=B_stl ; - } - - static inline void vector_to_stl(gene_vector & B, stl_vector & B_stl){ - B_stl=B ; - } - - static inline void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){ - A_stl=A ; - } - - static inline void copy_vector(const gene_vector & source, gene_vector & cible, int N){ - for (int i=0;i<N;i++) - cible[i]=source[i]; - } - - static inline void copy_matrix(const gene_matrix & source, gene_matrix & cible, int N) - { - for (int i=0;i<N;i++){ - for (int j=0;j<N;j++){ - cible[i][j]=source[i][j]; - } - } - } - - class somme { - public: - - somme(real coef):_coef(coef){}; - - real operator()(const real & val1, const real & val2) - { - return _coef * val1 + val2; - } - - private: - - real _coef; - - }; - - - class vector_generator { - public: - - vector_generator(const gene_matrix & a_matrix, const gene_vector & a_vector): - _matrice(a_matrix), - _vecteur(a_vector), - _index(0) - {}; - real operator()( void ) - { - - const gene_vector & ai=_matrice[_index]; - int N=ai.size(); - - _index++; - - return std::inner_product(&ai[0],&ai[N],&_vecteur[0],0.0); - } - - private: - - int _index; - const gene_matrix & _matrice; - const gene_vector & _vecteur; - - }; - - static inline void atv_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) - { - std::generate(&X[0],&X[N],vector_generator(A,B)); - } - - static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int N) - { - std::transform(&X[0],&X[N],&Y[0],&Y[0],somme(coef)); - } - -}; - -#endif diff --git a/bench/btl/libs/STL_algo/main.cpp b/bench/btl/libs/STL_algo/main.cpp deleted file mode 100644 index 9ce2d947e..000000000 --- a/bench/btl/libs/STL_algo/main.cpp +++ /dev/null @@ -1,39 +0,0 @@ -//===================================================== -// File : main.cpp -// Author : L. Plagne <laurent.plagne@edf.fr)> -// Copyright (C) EDF R&D, lun sep 30 14:23:23 CEST 2002 -//===================================================== -// -// This program is free software; you can redistribute it and/or -// modify it under the terms of the GNU General Public License -// as published by the Free Software Foundation; either version 2 -// of the License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -// -#include "utilities.h" -#include "STL_algo_interface.hh" -#include "bench.hh" -#include "action_atv_product.hh" -#include "action_axpy.hh" - -BTL_MAIN; - -int main() -{ - - - bench<Action_atv_product<STL_algo_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); - - bench<Action_axpy<STL_algo_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT); - - return 0; -} - - diff --git a/bench/btl/libs/f77/CMakeLists.txt b/bench/btl/libs/f77/CMakeLists.txt deleted file mode 100644 index cecb9160c..000000000 --- a/bench/btl/libs/f77/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -if(CMAKE_MINOR_VERSION GREATER 4) - if(NOT MSVC) - enable_language(Fortran) - endif(NOT MSVC) - btl_add_bench(btl_f77 main.cpp dmxv.f smxv.f dmxm.f smxm.f daxpy.f saxpy.f data.f sata.f daat.f saat.f OFF) -endif(CMAKE_MINOR_VERSION GREATER 4)
\ No newline at end of file diff --git a/bench/btl/libs/f77/daat.f b/bench/btl/libs/f77/daat.f deleted file mode 100644 index a50329a66..000000000 --- a/bench/btl/libs/f77/daat.f +++ /dev/null @@ -1,14 +0,0 @@ - SUBROUTINE DAAT(A,X,N) -** -** X = AT * A - REAL*8 A(N,N),X(N,N),R - DO 20 I=1,N - DO 20 J=1,N - R=0. - DO 10 K=1,N - R=R+A(I,K)*A(J,K) - 10 CONTINUE - X(I,J)=R - 20 CONTINUE - RETURN - END diff --git a/bench/btl/libs/f77/data.f b/bench/btl/libs/f77/data.f deleted file mode 100644 index 709211ca5..000000000 --- a/bench/btl/libs/f77/data.f +++ /dev/null @@ -1,14 +0,0 @@ - SUBROUTINE DATA(A,X,N) -** -** X = AT * A - REAL*8 A(N,N),X(N,N),R - DO 20 I=1,N - DO 20 J=1,N - R=0. - DO 10 K=1,N - R=R+A(K,I)*A(K,J) - 10 CONTINUE - X(I,J)=R - 20 CONTINUE - RETURN - END diff --git a/bench/btl/libs/f77/daxpy.f b/bench/btl/libs/f77/daxpy.f deleted file mode 100644 index 29514a222..000000000 --- a/bench/btl/libs/f77/daxpy.f +++ /dev/null @@ -1,18 +0,0 @@ - SUBROUTINE DAXPYF(N,A,X,Y) -** *************************************** -** CALCULE Y = Y + A*X -** *************************************** -*>N NOMBRE D'OPERATIONS A FAIRE -*>A CONSTANTE MULTIPLICATIVE -*>X TABLEAU -*=Y TABLEAU DES RESULTATS -*A R. SANCHEZ ( EARLY WINTER 1987 ) -*V M.F. ROBEAU - REAL*8 X(1),Y(1) - REAL*8 A - DO 10 I=1,N - Y(I)=Y(I)+A*X(I) - 10 CONTINUE - RETURN - END - diff --git a/bench/btl/libs/f77/dmxm.f b/bench/btl/libs/f77/dmxm.f deleted file mode 100644 index eb7ef9006..000000000 --- a/bench/btl/libs/f77/dmxm.f +++ /dev/null @@ -1,32 +0,0 @@ - SUBROUTINE DMXM(A,N,B,M,C,L) -** -** C = A * B -** A ET B MATRICES A(N,M) B(M,L) ==> C(N,L) -** -*>A PREMIERE MATRICE -*>N PREMIERE DIMENSION DE A ET DE C -*>B DEUXIEME MATRICE -*>M DEUXIEME DIMENSION DE A ET PERMIERE DE B -*<C MATRICE PRODUIT DE A ET DE B -*>L DEUXIEME DIMENSION DE B ET DE C -*A R. SANCHEZ ( EARLY WINTER 1987 ) -*V M.F. ROBEAU -*M AM BAUDRON - AVRIL 94 -*: ERREUR DANS L'APPEL A L'UTILITAIRE SGEMM -*: APPEL A L'UTILITAIRE SGEMM DE LA LIBRAIRIE BLAS SUR HP -*M AM BAUDRON - NOVEMBRE 1991 -*: ERREUR ( SOMME SUR LES TERMES PAS FAITE ) -*: APPEL A L'UTILITAIRE SGEMM DE LA LIBRAIRIE BLAS SUR RISC -*M AM BAUDRON - MAI 1993 -*: CHANGEMENT DES %IF LOCAL SUN MIPS SUITE A INTRODUCTION VERSION IBM - REAL*8 A(N,M),B(M,L),C(N,L),R - DO 20 I=1,N - DO 20 J=1,L - R=0. - DO 10 K=1,M - R=R+A(I,K)*B(K,J) - 10 CONTINUE - C(I,J)=R - 20 CONTINUE - RETURN - END diff --git a/bench/btl/libs/f77/dmxm.f.mfr b/bench/btl/libs/f77/dmxm.f.mfr deleted file mode 100644 index 82ccac9a5..000000000 --- a/bench/btl/libs/f77/dmxm.f.mfr +++ /dev/null @@ -1,36 +0,0 @@ - - SUBROUTINE DMXM(A,N,B,M,C,L) -** -** C = A * B -** A ET B MATRICES A(N,M) B(M,L) ==> C(N,L) -** -*>A PREMIERE MATRICE -*>N PREMIERE DIMENSION DE A ET DE C -*>B DEUXIEME MATRICE -*>M DEUXIEME DIMENSION DE A ET PERMIERE DE B -*<C MATRICE PRODUIT DE A ET DE B -*>L DEUXIEME DIMENSION DE B ET DE C -*A R. SANCHEZ ( EARLY WINTER 1987 ) -*V M.F. ROBEAU -*M AM BAUDRON - AVRIL 94 -*: ERREUR DANS L'APPEL A L'UTILITAIRE SGEMM -*: APPEL A L'UTILITAIRE SGEMM DE LA LIBRAIRIE BLAS SUR HP -*M AM BAUDRON - NOVEMBRE 1991 -*: ERREUR ( SOMME SUR LES TERMES PAS FAITE ) -*: APPEL A L'UTILITAIRE SGEMM DE LA LIBRAIRIE BLAS SUR RISC -*M AM BAUDRON - MAI 1993 -*: CHANGEMENT DES %IF LOCAL SUN MIPS SUITE A INTRODUCTION VERSION IBM - REAL*8 A(N,M),B(M,L),C(N,L),R - DO 5 J=1,L - DO 5 I=1,N - C(I,J)=0. - 5 CONTINUE - DO 10 K=1,M - DO 20 J=1,L - R=B(K,J) - DO 20 I=1,N - C(I,J)=C(I,J)+A(I,K)*R - 20 CONTINUE - 10 CONTINUE - RETURN - END diff --git a/bench/btl/libs/f77/dmxv.f b/bench/btl/libs/f77/dmxv.f deleted file mode 100644 index bd7e4d550..000000000 --- a/bench/btl/libs/f77/dmxv.f +++ /dev/null @@ -1,39 +0,0 @@ - SUBROUTINE DMXV(A,N,X,M,R) -C -** -** VERSION DOUBLE PRECISION DE MXV -** R = A * X -** A MATRICE A(N,M) -** R ET X VECTEURS -** -*>A PREMIERE MATRICE -*>N PREMIERE DIMENSION DE A -*>X VECTEUR -*>M DEUXIEME DIMENSION DE A -*<R VECTEUR PRODUIT DE A ET DE X -** -*A M. COSTE -*V M.F. ROBEAU -*M -* - REAL*8 X(1),R(1),A(N,M) - REAL*8 S -C DO 20 I=1,N -C S=0. -C DO 10 J=1,M -C S=S+A(I,J)*X(J) -C 10 CONTINUE -C R(I)=S -C 20 CONTINUE - DO 5 I=1,N - R(I)=0 - 5 CONTINUE - DO 10 J=1,M - S=X(J) - DO 20 I=1,N - R(I)=R(I)+A(I,J)*S - 20 CONTINUE - 10 CONTINUE - RETURN - END - diff --git a/bench/btl/libs/f77/f77_interface.hh b/bench/btl/libs/f77/f77_interface.hh deleted file mode 100644 index 8267dbd42..000000000 --- a/bench/btl/libs/f77/f77_interface.hh +++ /dev/null @@ -1,129 +0,0 @@ -//===================================================== -// File : f77_interface.hh -// Author : L. Plagne <laurent.plagne@edf.fr)> -// Copyright (C) EDF R&D, lun sep 30 14:23:24 CEST 2002 -//===================================================== -// -// This program is free software; you can redistribute it and/or -// modify it under the terms of the GNU General Public License -// as published by the Free Software Foundation; either version 2 -// of the License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -// -#ifndef F77_INTERFACE_HH -#define F77_INTERFACE_HH -#include "f77_interface_base.hh" -#include <string> - -extern "C" -{ - void dmxv_(double * A, int * N, double * X, int * M, double *R); - void smxv_(float * A, int * N, float * X, int * M, float *R); - - void dmxm_(double * A, int * N, double * B, int * M, double *C, int * K); - void smxm_(float * A, int * N, float * B, int * M, float *C, int * K); - - void data_(double * A, double *X, int * N); - void sata_(float * A, float *X, int * N); - - void daat_(double * A, double *X, int * N); - void saat_(float * A, float *X, int * N); - - void saxpyf_(int * N, float * coef, float * X, float *Y); - void daxpyf_(int * N, double * coef, double * X, double *Y); -} - -template<class real> -class f77_interface : public f77_interface_base<real> -{ -public : - - typedef typename f77_interface_base<real>::gene_matrix gene_matrix; - typedef typename f77_interface_base<real>::gene_vector gene_vector; - - static inline std::string name( void ) - { - return "f77"; - } - - static inline void matrix_vector_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N) - { - dmxv_(A,&N,B,&N,X); - } - - static inline void matrix_matrix_product(gene_matrix & A, gene_matrix & B, gene_matrix & X, int N) - { - dmxm_(A,&N,B,&N,X,&N); - } - - static inline void ata_product(gene_matrix & A, gene_matrix & X, int N) - { - data_(A,X,&N); - } - - static inline void aat_product(gene_matrix & A, gene_matrix & X, int N) - { - daat_(A,X,&N); - } - - static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int N) - { - int one=1; - daxpyf_(&N,&coef,X,Y); - } - - -}; - - -template<> -class f77_interface<float> : public f77_interface_base<float> -{ -public : - - static inline std::string name( void ) - { - return "F77"; - } - - - static inline void matrix_vector_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N) - { - smxv_(A,&N,B,&N,X); - } - - static inline void matrix_matrix_product(gene_matrix & A, gene_matrix & B, gene_matrix & X, int N) - { - smxm_(A,&N,B,&N,X,&N); - } - - static inline void ata_product(gene_matrix & A, gene_matrix & X, int N) - { - sata_(A,X,&N); - } - - static inline void aat_product(gene_matrix & A, gene_matrix & X, int N) - { - saat_(A,X,&N); - } - - - static inline void axpy(float coef, const gene_vector & X, gene_vector & Y, int N) - { - saxpyf_(&N,&coef,X,Y); - } - -}; - - -#endif - - - diff --git a/bench/btl/libs/f77/f77_interface_base.hh b/bench/btl/libs/f77/f77_interface_base.hh deleted file mode 100644 index ab8a18295..000000000 --- a/bench/btl/libs/f77/f77_interface_base.hh +++ /dev/null @@ -1,91 +0,0 @@ -//===================================================== -// File : f77_interface_base.hh -// Author : L. Plagne <laurent.plagne@edf.fr)> -// Copyright (C) EDF R&D, lun sep 30 14:23:25 CEST 2002 -//===================================================== -// -// This program is free software; you can redistribute it and/or -// modify it under the terms of the GNU General Public License -// as published by the Free Software Foundation; either version 2 -// of the License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -// -#ifndef F77_INTERFACE_BASE_HH -#define F77_INTERFACE_BASE_HH - -#include "utilities.h" -#include <vector> -template<class real> -class f77_interface_base{ - -public: - - typedef real real_type ; - typedef std::vector<real> stl_vector; - typedef std::vector<stl_vector > stl_matrix; - - typedef real * gene_matrix; - typedef real * gene_vector; - - static void free_matrix(gene_matrix & A, int N){ - delete A; - } - - static void free_vector(gene_vector & B){ - delete B; - } - - static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ - int N = A_stl.size(); - A = new real[N*N]; - for (int j=0;j<N;j++) - for (int i=0;i<N;i++) - A[i+N*j] = A_stl[j][i]; - } - - static inline void vector_from_stl(gene_vector & B, stl_vector & B_stl){ - int N = B_stl.size(); - B = new real[N]; - for (int i=0;i<N;i++) - B[i] = B_stl[i]; - } - - static inline void vector_to_stl(gene_vector & B, stl_vector & B_stl){ - int N = B_stl.size(); - for (int i=0;i<N;i++) - B_stl[i] = B[i]; - } - - static inline void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){ - int N = A_stl.size(); - for (int j=0;j<N;j++){ - A_stl[j].resize(N); - for (int i=0;i<N;i++) - A_stl[j][i] = A[i+N*j]; - } - } - - static inline void copy_vector(const gene_vector & source, gene_vector & cible, int N){ - for (int i=0;i<N;i++) - cible[i]=source[i]; - } - - static inline void copy_matrix(const gene_matrix & source, gene_matrix & cible, int N){ - for (int j=0;j<N;j++){ - for (int i=0;i<N;i++){ - cible[i+N*j] = source[i+N*j]; - } - } - } - -}; - - -#endif diff --git a/bench/btl/libs/f77/main.cpp b/bench/btl/libs/f77/main.cpp deleted file mode 100644 index a4b67b487..000000000 --- a/bench/btl/libs/f77/main.cpp +++ /dev/null @@ -1,47 +0,0 @@ -//===================================================== -// File : main.cpp -// Author : L. Plagne <laurent.plagne@edf.fr)> -// Copyright (C) EDF R&D, lun sep 30 14:23:25 CEST 2002 -//===================================================== -// -// This program is free software; you can redistribute it and/or -// modify it under the terms of the GNU General Public License -// as published by the Free Software Foundation; either version 2 -// of the License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -// -#include "utilities.h" -#include "f77_interface.hh" -#include "bench.hh" -#include "action_matrix_vector_product.hh" -#include "action_matrix_matrix_product.hh" -#include "action_axpy.hh" -#include "action_lu_solve.hh" -#include "action_ata_product.hh" -#include "action_aat_product.hh" - -BTL_MAIN; - -int main() -{ - bench<Action_axpy<f77_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT); - - bench<Action_matrix_vector_product<f77_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); - - bench<Action_matrix_matrix_product<f77_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); - - bench<Action_ata_product<f77_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); - - bench<Action_aat_product<f77_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); - - return 0; -} - - diff --git a/bench/btl/libs/f77/saat.f b/bench/btl/libs/f77/saat.f deleted file mode 100644 index 5d1855d2c..000000000 --- a/bench/btl/libs/f77/saat.f +++ /dev/null @@ -1,14 +0,0 @@ - SUBROUTINE SAAT(A,X,N) -** -** X = AT * A - REAL*4 A(N,N),X(N,N) - DO 20 I=1,N - DO 20 J=1,N - R=0. - DO 10 K=1,N - R=R+A(I,K)*A(J,K) - 10 CONTINUE - X(I,J)=R - 20 CONTINUE - RETURN - END diff --git a/bench/btl/libs/f77/sata.f b/bench/btl/libs/f77/sata.f deleted file mode 100644 index 3ab83d958..000000000 --- a/bench/btl/libs/f77/sata.f +++ /dev/null @@ -1,14 +0,0 @@ - SUBROUTINE SATA(A,X,N) -** -** X = AT * A - REAL*4 A(N,N),X(N,N) - DO 20 I=1,N - DO 20 J=1,N - R=0. - DO 10 K=1,N - R=R+A(K,I)*A(K,J) - 10 CONTINUE - X(I,J)=R - 20 CONTINUE - RETURN - END diff --git a/bench/btl/libs/f77/saxpy.f b/bench/btl/libs/f77/saxpy.f deleted file mode 100644 index d0f74fd70..000000000 --- a/bench/btl/libs/f77/saxpy.f +++ /dev/null @@ -1,16 +0,0 @@ - SUBROUTINE SAXPYF(N,A,X,Y) -** *************************************** -** CALCULE Y = Y + A*X -** *************************************** -*>N NOMBRE D'OPERATIONS A FAIRE -*>A CONSTANTE MULTIPLICATIVE -*>X TABLEAU -*=Y TABLEAU DES RESULTATS -*A R. SANCHEZ ( EARLY WINTER 1987 ) -*V M.F. ROBEAU - DIMENSION X(1),Y(1) - DO 10 I=1,N - Y(I)=Y(I)+A*X(I) - 10 CONTINUE - RETURN - END diff --git a/bench/btl/libs/f77/smxm.f b/bench/btl/libs/f77/smxm.f deleted file mode 100644 index a1e63adca..000000000 --- a/bench/btl/libs/f77/smxm.f +++ /dev/null @@ -1,32 +0,0 @@ - SUBROUTINE SMXM(A,N,B,M,C,L) -** -** C = A * B -** A ET B MATRICES A(N,M) B(M,L) ==> C(N,L) -** -*>A PREMIERE MATRICE -*>N PREMIERE DIMENSION DE A ET DE C -*>B DEUXIEME MATRICE -*>M DEUXIEME DIMENSION DE A ET PERMIERE DE B -*<C MATRICE PRODUIT DE A ET DE B -*>L DEUXIEME DIMENSION DE B ET DE C -*A R. SANCHEZ ( EARLY WINTER 1987 ) -*V M.F. ROBEAU -*M AM BAUDRON - AVRIL 94 -*: ERREUR DANS L'APPEL A L'UTILITAIRE SGEMM -*: APPEL A L'UTILITAIRE SGEMM DE LA LIBRAIRIE BLAS SUR HP -*M AM BAUDRON - NOVEMBRE 1991 -*: ERREUR ( SOMME SUR LES TERMES PAS FAITE ) -*: APPEL A L'UTILITAIRE SGEMM DE LA LIBRAIRIE BLAS SUR RISC -*M AM BAUDRON - MAI 1993 -*: CHANGEMENT DES %IF LOCAL SUN MIPS SUITE A INTRODUCTION VERSION IBM - DIMENSION A(N,M),B(M,L),C(N,L) - DO 20 I=1,N - DO 20 J=1,L - R=0. - DO 10 K=1,M - R=R+A(I,K)*B(K,J) - 10 CONTINUE - C(I,J)=R - 20 CONTINUE - RETURN - END diff --git a/bench/btl/libs/f77/smxv.f b/bench/btl/libs/f77/smxv.f deleted file mode 100644 index d2f7ed24e..000000000 --- a/bench/btl/libs/f77/smxv.f +++ /dev/null @@ -1,38 +0,0 @@ - SUBROUTINE SMXV(A,N,X,M,R) -C -** -** VERSION DOUBLE PRECISION DE MXV -** R = A * X -** A MATRICE A(N,M) -** R ET X VECTEURS -** -*>A PREMIERE MATRICE -*>N PREMIERE DIMENSION DE A -*>X VECTEUR -*>M DEUXIEME DIMENSION DE A -*<R VECTEUR PRODUIT DE A ET DE X -** -*A M. COSTE -*V M.F. ROBEAU -*M -* - REAL*4 X(1),R(1),A(N,M) - REAL*4 S -C DO 20 I=1,N -C S=0. -C DO 10 J=1,M -C S=S+A(I,J)*X(J) -C 10 CONTINUE -C R(I)=S -C 20 CONTINUE - DO 5 I=1,N - R(I)=0 - 5 CONTINUE - DO 10 J=1,M - S=X(J) - DO 20 I=1,N - R(I)=R(I)+A(I,J)*S - 20 CONTINUE - 10 CONTINUE - RETURN - END diff --git a/bench/btl/libs/f77/test_interface.hh b/bench/btl/libs/f77/test_interface.hh deleted file mode 100644 index 230c8dbc8..000000000 --- a/bench/btl/libs/f77/test_interface.hh +++ /dev/null @@ -1,36 +0,0 @@ -//===================================================== -// File : test_interface.hh -// Author : L. Plagne <laurent.plagne@edf.fr)> -// Copyright (C) EDF R&D, lun sep 30 14:23:25 CEST 2002 -//===================================================== -// -// This program is free software; you can redistribute it and/or -// modify it under the terms of the GNU General Public License -// as published by the Free Software Foundation; either version 2 -// of the License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -// -#ifndef TEST_INTERFACE_HH -#define TEST_INTERFACE_HH - -template< class Interface > -void test_interface( void ){ - - Interface::interface_name(); - - typename Interface::gene_matrix A; - - - - - -} - -#endif diff --git a/bench/btl/libs/hand_vec/CMakeLists.txt b/bench/btl/libs/hand_vec/CMakeLists.txt deleted file mode 100644 index 3fffbfe8a..000000000 --- a/bench/btl/libs/hand_vec/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -find_package(Eigen2) -if (EIGEN2_FOUND) - - include_directories(${EIGEN2_INCLUDE_DIR} ${PROJECT_SOURCE_DIR}/libs/f77) - btl_add_bench(btl_hand_vec main.cpp OFF) - - btl_add_bench(btl_hand_peeling main.cpp OFF) - if (BUILD_btl_hand_peeling) - set_target_properties(btl_hand_peeling PROPERTIES COMPILE_FLAGS "-DPEELING") - endif (BUILD_btl_hand_peeling) - -endif (EIGEN2_FOUND)
\ No newline at end of file diff --git a/bench/btl/libs/hand_vec/hand_vec_interface.hh b/bench/btl/libs/hand_vec/hand_vec_interface.hh deleted file mode 100755 index 0bb4b64ca..000000000 --- a/bench/btl/libs/hand_vec/hand_vec_interface.hh +++ /dev/null @@ -1,886 +0,0 @@ -//===================================================== -// File : hand_vec_interface.hh -// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr> -//===================================================== -// -// This program is free software; you can redistribute it and/or -// modify it under the terms of the GNU General Public License -// as published by the Free Software Foundation; either version 2 -// of the License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -// -#ifndef HAND_VEC_INTERFACE_HH -#define HAND_VEC_INTERFACE_HH - -#include <Eigen/Core> -#include "f77_interface.hh" - -using namespace Eigen; - -template<class real> -class hand_vec_interface : public f77_interface_base<real> { - -public : - - typedef typename internal::packet_traits<real>::type Packet; - static const int PacketSize = internal::packet_traits<real>::size; - - typedef typename f77_interface_base<real>::stl_matrix stl_matrix; - typedef typename f77_interface_base<real>::stl_vector stl_vector; - typedef typename f77_interface_base<real>::gene_matrix gene_matrix; - typedef typename f77_interface_base<real>::gene_vector gene_vector; - - static void free_matrix(gene_matrix & A, int N){ - internal::aligned_free(A); - } - - static void free_vector(gene_vector & B){ - internal::aligned_free(B); - } - - static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ - int N = A_stl.size(); - A = (real*)internal::aligned_malloc(N*N*sizeof(real)); - for (int j=0;j<N;j++) - for (int i=0;i<N;i++) - A[i+N*j] = A_stl[j][i]; - } - - static inline void vector_from_stl(gene_vector & B, stl_vector & B_stl){ - int N = B_stl.size(); - B = (real*)internal::aligned_malloc(N*sizeof(real)); - for (int i=0;i<N;i++) - B[i] = B_stl[i]; - } - - static inline std::string name() { - #ifdef PEELING - return "hand_vectorized_peeling"; - #else - return "hand_vectorized"; - #endif - } - - static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) - { - asm("#begin matrix_vector_product"); - int AN = (N/PacketSize)*PacketSize; - int ANP = (AN/(2*PacketSize))*2*PacketSize; - int bound = (N/4)*4; - for (int i=0;i<N;i++) - X[i] = 0; - - for (int i=0;i<bound;i+=4) - { - register real* __restrict__ A0 = A + i*N; - register real* __restrict__ A1 = A + (i+1)*N; - register real* __restrict__ A2 = A + (i+2)*N; - register real* __restrict__ A3 = A + (i+3)*N; - - Packet ptmp0 = internal::pset1(B[i]); - Packet ptmp1 = internal::pset1(B[i+1]); - Packet ptmp2 = internal::pset1(B[i+2]); - Packet ptmp3 = internal::pset1(B[i+3]); -// register Packet ptmp0, ptmp1, ptmp2, ptmp3; -// asm( -// -// "movss (%[B],%[j],4), %[ptmp0] \n\t" -// "shufps $0,%[ptmp0],%[ptmp0] \n\t" -// "movss 4(%[B],%[j],4), %[ptmp1] \n\t" -// "shufps $0,%[ptmp1],%[ptmp1] \n\t" -// "movss 8(%[B],%[j],4), %[ptmp2] \n\t" -// "shufps $0,%[ptmp2],%[ptmp2] \n\t" -// "movss 12(%[B],%[j],4), %[ptmp3] \n\t" -// "shufps $0,%[ptmp3],%[ptmp3] \n\t" -// : [ptmp0] "=x" (ptmp0), -// [ptmp1] "=x" (ptmp1), -// [ptmp2] "=x" (ptmp2), -// [ptmp3] "=x" (ptmp3) -// : [B] "r" (B), -// [j] "r" (size_t(i)) -// : ); - - if (AN>0) - { -// for (size_t j = 0;j<ANP;j+=8) -// { -// asm( -// -// "movaps (%[A0],%[j],4), %%xmm8 \n\t" -// "movaps 16(%[A0],%[j],4), %%xmm12 \n\t" -// "movups (%[A3],%[j],4), %%xmm11 \n\t" -// "movups 16(%[A3],%[j],4), %%xmm15 \n\t" -// "movups (%[A2],%[j],4), %%xmm10 \n\t" -// "movups 16(%[A2],%[j],4), %%xmm14 \n\t" -// "movups (%[A1],%[j],4), %%xmm9 \n\t" -// "movups 16(%[A1],%[j],4), %%xmm13 \n\t" -// -// "mulps %[ptmp0], %%xmm8 \n\t" -// "addps (%[res0],%[j],4), %%xmm8 \n\t" -// "mulps %[ptmp3], %%xmm11 \n\t" -// "addps %%xmm11, %%xmm8 \n\t" -// "mulps %[ptmp2], %%xmm10 \n\t" -// "addps %%xmm10, %%xmm8 \n\t" -// "mulps %[ptmp1], %%xmm9 \n\t" -// "addps %%xmm9, %%xmm8 \n\t" -// "movaps %%xmm8, (%[res0],%[j],4) \n\t" -// -// "mulps %[ptmp0], %%xmm12 \n\t" -// "addps 16(%[res0],%[j],4), %%xmm12 \n\t" -// "mulps %[ptmp3], %%xmm15 \n\t" -// "addps %%xmm15, %%xmm12 \n\t" -// "mulps %[ptmp2], %%xmm14 \n\t" -// "addps %%xmm14, %%xmm12 \n\t" -// "mulps %[ptmp1], %%xmm13 \n\t" -// "addps %%xmm13, %%xmm12 \n\t" -// "movaps %%xmm12, 16(%[res0],%[j],4) \n\t" -// : -// : [res0] "r" (X), [j] "r" (j),[A0] "r" (A0), -// [A1] "r" (A1), -// [A2] "r" (A2), -// [A3] "r" (A3), -// [ptmp0] "x" (ptmp0), -// [ptmp1] "x" (ptmp1), -// [ptmp2] "x" (ptmp2), -// [ptmp3] "x" (ptmp3) -// : "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "%r14"); -// } - register Packet A00; - register Packet A01; - register Packet A02; - register Packet A03; - register Packet A10; - register Packet A11; - register Packet A12; - register Packet A13; - for (int j = 0;j<ANP;j+=2*PacketSize) - { -// A00 = internal::pload(&A0[j]); -// A01 = internal::ploadu(&A1[j]); -// A02 = internal::ploadu(&A2[j]); -// A03 = internal::ploadu(&A3[j]); -// A10 = internal::pload(&A0[j+PacketSize]); -// A11 = internal::ploadu(&A1[j+PacketSize]); -// A12 = internal::ploadu(&A2[j+PacketSize]); -// A13 = internal::ploadu(&A3[j+PacketSize]); -// -// A00 = internal::pmul(ptmp0, A00); -// A01 = internal::pmul(ptmp1, A01); -// A02 = internal::pmul(ptmp2, A02); -// A03 = internal::pmul(ptmp3, A03); -// A10 = internal::pmul(ptmp0, A10); -// A11 = internal::pmul(ptmp1, A11); -// A12 = internal::pmul(ptmp2, A12); -// A13 = internal::pmul(ptmp3, A13); -// -// A00 = internal::padd(A00,A01); -// A02 = internal::padd(A02,A03); -// A00 = internal::padd(A00,internal::pload(&X[j])); -// A00 = internal::padd(A00,A02); -// internal::pstore(&X[j],A00); -// -// A10 = internal::padd(A10,A11); -// A12 = internal::padd(A12,A13); -// A10 = internal::padd(A10,internal::pload(&X[j+PacketSize])); -// A10 = internal::padd(A10,A12); -// internal::pstore(&X[j+PacketSize],A10); - - internal::pstore(&X[j], - internal::padd(internal::pload(&X[j]), - internal::padd( - internal::padd(internal::pmul(ptmp0,internal::pload(&A0[j])),internal::pmul(ptmp1,internal::ploadu(&A1[j]))), - internal::padd(internal::pmul(ptmp2,internal::ploadu(&A2[j])),internal::pmul(ptmp3,internal::ploadu(&A3[j]))) ))); - - internal::pstore(&X[j+PacketSize], - internal::padd(internal::pload(&X[j+PacketSize]), - internal::padd( - internal::padd(internal::pmul(ptmp0,internal::pload(&A0[j+PacketSize])),internal::pmul(ptmp1,internal::ploadu(&A1[j+PacketSize]))), - internal::padd(internal::pmul(ptmp2,internal::ploadu(&A2[j+PacketSize])),internal::pmul(ptmp3,internal::ploadu(&A3[j+PacketSize]))) ))); - } - for (int j = ANP;j<AN;j+=PacketSize) - internal::pstore(&X[j], - internal::padd(internal::pload(&X[j]), - internal::padd( - internal::padd(internal::pmul(ptmp0,internal::pload(&A0[j])),internal::pmul(ptmp1,internal::ploadu(&A1[j]))), - internal::padd(internal::pmul(ptmp2,internal::ploadu(&A2[j])),internal::pmul(ptmp3,internal::ploadu(&A3[j]))) ))); - } - // process remaining scalars - for (int j=AN;j<N;j++) - X[j] += internal::pfirst(ptmp0) * A0[j] + internal::pfirst(ptmp1) * A1[j] + internal::pfirst(ptmp2) * A2[j] + internal::pfirst(ptmp3) * A3[j]; - } - for (int i=bound;i<N;i++) - { - real tmp0 = B[i]; - Packet ptmp0 = internal::pset1(tmp0); - int iN0 = i*N; - if (AN>0) - { - bool aligned0 = (iN0 % PacketSize) == 0; - if (aligned0) - for (int j = 0;j<AN;j+=PacketSize) - internal::pstore(&X[j], internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])),internal::pload(&X[j]))); - else - for (int j = 0;j<AN;j+=PacketSize) - internal::pstore(&X[j], internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])),internal::pload(&X[j]))); - } - // process remaining scalars - for (int j=AN;j<N;j++) - X[j] += tmp0 * A[j+iN0]; - } - asm("#end matrix_vector_product"); - } - - static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) - { - -// int AN = (N/PacketSize)*PacketSize; -// int ANP = (AN/(2*PacketSize))*2*PacketSize; -// int bound = (N/4)*4; - for (int i=0;i<N;i++) - X[i] = 0; - - int bound = std::max(0,N-8) & 0xfffffffE; - - for (int j=0;j<bound;j+=2) - { - register real* __restrict__ A0 = A + j*N; - register real* __restrict__ A1 = A + (j+1)*N; - - real t0 = B[j]; - Packet ptmp0 = internal::pset1(t0); - real t1 = B[j+1]; - Packet ptmp1 = internal::pset1(t1); - - real t2 = 0; - Packet ptmp2 = internal::pset1(t2); - real t3 = 0; - Packet ptmp3 = internal::pset1(t3); - - int starti = j+2; - int alignedEnd = starti; - int alignedStart = (starti) + internal::first_aligned(&X[starti], N-starti); - alignedEnd = alignedStart + ((N-alignedStart)/(PacketSize))*(PacketSize); - - X[j] += t0 * A0[j]; - X[j+1] += t1 * A1[j]; - - X[j+1] += t0 * A0[j+1]; - t2 += A0[j+1] * B[j+1]; - -// alignedStart = alignedEnd; - for (int i=starti; i<alignedStart; ++i) { - X[i] += t0 * A0[i] + t1 * A1[i]; - t2 += A0[i] * B[i]; - t3 += A1[i] * B[i]; - } - asm("#begin symv"); - for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize) { - Packet A0i = internal::ploadu(&A0[i]); - Packet A1i = internal::ploadu(&A1[i]); -// Packet A0i1 = internal::ploadu(&A0[i+PacketSize]); - Packet Xi = internal::pload(&X[i]); - Packet Bi = internal::pload/*u*/(&B[i]); -// Packet Xi1 = internal::pload(&X[i+PacketSize]); -// Packet Bi1 = internal::pload/*u*/(&B[i+PacketSize]); - Xi = internal::padd(internal::padd(Xi, internal::pmul(ptmp0, A0i)), internal::pmul(ptmp1, A1i)); - ptmp2 = internal::padd(ptmp2, internal::pmul(A0i, Bi)); - ptmp3 = internal::padd(ptmp3, internal::pmul(A1i, Bi)); -// Xi1 = internal::padd(Xi1, internal::pmul(ptmp1, A0i1)); -// ptmp2 = internal::padd(ptmp2, internal::pmul(A0i1, Bi1)); -// - internal::pstore(&X[i],Xi); -// internal::pstore(&X[i+PacketSize],Xi1); -// asm( -// "prefetchnta 64(%[A0],%[i],4) \n\t" -// //"movups (%[A0],%[i],4), %%xmm8 \n\t" -// "movsd (%[A0],%[i],4), %%xmm8 \n\t" -// "movhps 8(%[A0],%[i],4), %%xmm8 \n\t" -// // "movups 16(%[A0],%[i],4), %%xmm9 \n\t" -// // "movups 64(%[A0],%[i],4), %%xmm15 \n\t" -// "movaps (%[B], %[i],4), %%xmm12 \n\t" -// // "movaps 16(%[B], %[i],4), %%xmm13 \n\t" -// "movaps (%[X], %[i],4), %%xmm10 \n\t" -// // "movaps 16(%[X], %[i],4), %%xmm11 \n\t" -// -// "mulps %%xmm8, %%xmm12 \n\t" -// // "mulps %%xmm9, %%xmm13 \n\t" -// -// "mulps %[ptmp1], %%xmm8 \n\t" -// "addps %%xmm12, %[ptmp2] \n\t" -// "addps %%xmm8, %%xmm10 \n\t" -// -// -// -// -// // "mulps %[ptmp1], %%xmm9 \n\t" -// -// // "addps %%xmm9, %%xmm11 \n\t" -// // "addps %%xmm13, %[ptmp2] \n\t" -// -// "movaps %%xmm10, (%[X],%[i],4) \n\t" -// // "movaps %%xmm11, 16(%[X],%[i],4) \n\t" -// : -// : [X] "r" (X), [i] "r" (i), [A0] "r" (A0), -// [B] "r" (B), -// [ptmp1] "x" (ptmp1), -// [ptmp2] "x" (ptmp2) -// : "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm15"); - } - asm("#end symv"); - for (int i=alignedEnd; i<N; i++) { - X[i] += t0 * A0[i] + t1 * A1[i]; - t2 += A0[i] * B[i]; - t3 += A1[i] * B[i]; - } - - - X[j] += t2 + internal::predux(ptmp2); - X[j+1] += t3 + internal::predux(ptmp3); - } - for (int j=bound;j<N;j++) - { - register real* __restrict__ A0 = A + j*N; - - real t1 = B[j]; - real t2 = 0; - X[j] += t1 * A0[j]; - for (int i=j+1; i<N; i+=PacketSize) { - X[i] += t1 * A0[i]; - t2 += A0[i] * B[i]; - } - X[j] += t2; - } - - } - -// static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) -// { -// asm("#begin matrix_vector_product"); -// int AN = (N/PacketSize)*PacketSize; -// int ANP = (AN/(2*PacketSize))*2*PacketSize; -// int bound = (N/4)*4; -// for (int i=0;i<N;i++) -// X[i] = 0; -// -// for (int i=0;i<bound;i+=4) -// { -// real tmp0 = B[i]; -// Packet ptmp0 = internal::pset1(tmp0); -// real tmp1 = B[i+1]; -// Packet ptmp1 = internal::pset1(tmp1); -// real tmp2 = B[i+2]; -// Packet ptmp2 = internal::pset1(tmp2); -// real tmp3 = B[i+3]; -// Packet ptmp3 = internal::pset1(tmp3); -// int iN0 = i*N; -// int iN1 = (i+1)*N; -// int iN2 = (i+2)*N; -// int iN3 = (i+3)*N; -// if (AN>0) -// { -// // int aligned0 = (iN0 % PacketSize); -// int aligned1 = (iN1 % PacketSize); -// -// if (aligned1==0) -// { -// for (int j = 0;j<AN;j+=PacketSize) -// { -// internal::pstore(&X[j], -// internal::padd(internal::pload(&X[j]), -// internal::padd( -// internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])),internal::pmul(ptmp1,internal::pload(&A[j+iN1]))), -// internal::padd(internal::pmul(ptmp2,internal::pload(&A[j+iN2])),internal::pmul(ptmp3,internal::pload(&A[j+iN3]))) ))); -// } -// } -// else if (aligned1==2) -// { -// for (int j = 0;j<AN;j+=PacketSize) -// { -// internal::pstore(&X[j], -// internal::padd(internal::pload(&X[j]), -// internal::padd( -// internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])),internal::pmul(ptmp1,internal::ploadu(&A[j+iN1]))), -// internal::padd(internal::pmul(ptmp2,internal::pload(&A[j+iN2])),internal::pmul(ptmp3,internal::ploadu(&A[j+iN3]))) ))); -// } -// } -// else -// { -// for (int j = 0;j<ANP;j+=2*PacketSize) -// { -// internal::pstore(&X[j], -// internal::padd(internal::pload(&X[j]), -// internal::padd( -// internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])),internal::pmul(ptmp1,internal::ploadu(&A[j+iN1]))), -// internal::padd(internal::pmul(ptmp2,internal::ploadu(&A[j+iN2])),internal::pmul(ptmp3,internal::ploadu(&A[j+iN3]))) ))); -// -// internal::pstore(&X[j+PacketSize], -// internal::padd(internal::pload(&X[j+PacketSize]), -// internal::padd( -// internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+PacketSize+iN0])),internal::pmul(ptmp1,internal::ploadu(&A[j+PacketSize+iN1]))), -// internal::padd(internal::pmul(ptmp2,internal::ploadu(&A[j+PacketSize+iN2])),internal::pmul(ptmp3,internal::ploadu(&A[j+PacketSize+iN3]))) ))); -// -// // internal::pstore(&X[j+2*PacketSize], -// // internal::padd(internal::pload(&X[j+2*PacketSize]), -// // internal::padd( -// // internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+2*PacketSize+iN0])),internal::pmul(ptmp1,internal::ploadu(&A[j+2*PacketSize+iN1]))), -// // internal::padd(internal::pmul(ptmp2,internal::ploadu(&A[j+2*PacketSize+iN2])),internal::pmul(ptmp3,internal::ploadu(&A[j+2*PacketSize+iN3]))) ))); -// // -// // internal::pstore(&X[j+3*PacketSize], -// // internal::padd(internal::pload(&X[j+3*PacketSize]), -// // internal::padd( -// // internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+3*PacketSize+iN0])),internal::pmul(ptmp1,internal::ploadu(&A[j+3*PacketSize+iN1]))), -// // internal::padd(internal::pmul(ptmp2,internal::ploadu(&A[j+3*PacketSize+iN2])),internal::pmul(ptmp3,internal::ploadu(&A[j+3*PacketSize+iN3]))) ))); -// -// } -// for (int j = ANP;j<AN;j+=PacketSize) -// internal::pstore(&X[j], -// internal::padd(internal::pload(&X[j]), -// internal::padd( -// internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])),internal::pmul(ptmp1,internal::ploadu(&A[j+iN1]))), -// internal::padd(internal::pmul(ptmp2,internal::ploadu(&A[j+iN2])),internal::pmul(ptmp3,internal::ploadu(&A[j+iN3]))) ))); -// } -// } -// // process remaining scalars -// for (int j=AN;j<N;j++) -// X[j] += tmp0 * A[j+iN0] + tmp1 * A[j+iN1] + tmp2 * A[j+iN2] + tmp3 * A[j+iN3]; -// } -// for (int i=bound;i<N;i++) -// { -// real tmp0 = B[i]; -// Packet ptmp0 = internal::pset1(tmp0); -// int iN0 = i*N; -// if (AN>0) -// { -// bool aligned0 = (iN0 % PacketSize) == 0; -// if (aligned0) -// for (int j = 0;j<AN;j+=PacketSize) -// internal::pstore(&X[j], internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])),internal::pload(&X[j]))); -// else -// for (int j = 0;j<AN;j+=PacketSize) -// internal::pstore(&X[j], internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])),internal::pload(&X[j]))); -// } -// // process remaining scalars -// for (int j=AN;j<N;j++) -// X[j] += tmp0 * A[j+iN0]; -// } -// asm("#end matrix_vector_product"); -// } - -// static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) -// { -// asm("#begin matrix_vector_product"); -// int AN = (N/PacketSize)*PacketSize; -// for (int i=0;i<N;i++) -// X[i] = 0; -// -// for (int i=0;i<N;i+=2) -// { -// real tmp0 = B[i]; -// Packet ptmp0 = internal::pset1(tmp0); -// real tmp1 = B[i+1]; -// Packet ptmp1 = internal::pset1(tmp1); -// int iN0 = i*N; -// int iN1 = (i+1)*N; -// if (AN>0) -// { -// bool aligned0 = (iN0 % PacketSize) == 0; -// bool aligned1 = (iN1 % PacketSize) == 0; -// -// if (aligned0 && aligned1) -// { -// for (int j = 0;j<AN;j+=PacketSize) -// { -// internal::pstore(&X[j], -// internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])), -// internal::padd(internal::pmul(ptmp1,internal::pload(&A[j+iN1])),internal::pload(&X[j])))); -// } -// } -// else if (aligned0) -// { -// for (int j = 0;j<AN;j+=PacketSize) -// { -// internal::pstore(&X[j], -// internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])), -// internal::padd(internal::pmul(ptmp1,internal::ploadu(&A[j+iN1])),internal::pload(&X[j])))); -// } -// } -// else if (aligned1) -// { -// for (int j = 0;j<AN;j+=PacketSize) -// { -// internal::pstore(&X[j], -// internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])), -// internal::padd(internal::pmul(ptmp1,internal::pload(&A[j+iN1])),internal::pload(&X[j])))); -// } -// } -// else -// { -// int ANP = (AN/(4*PacketSize))*4*PacketSize; -// for (int j = 0;j<ANP;j+=4*PacketSize) -// { -// internal::pstore(&X[j], -// internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])), -// internal::padd(internal::pmul(ptmp1,internal::ploadu(&A[j+iN1])),internal::pload(&X[j])))); -// -// internal::pstore(&X[j+PacketSize], -// internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+PacketSize+iN0])), -// internal::padd(internal::pmul(ptmp1,internal::ploadu(&A[j+PacketSize+iN1])),internal::pload(&X[j+PacketSize])))); -// -// internal::pstore(&X[j+2*PacketSize], -// internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+2*PacketSize+iN0])), -// internal::padd(internal::pmul(ptmp1,internal::ploadu(&A[j+2*PacketSize+iN1])),internal::pload(&X[j+2*PacketSize])))); -// -// internal::pstore(&X[j+3*PacketSize], -// internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+3*PacketSize+iN0])), -// internal::padd(internal::pmul(ptmp1,internal::ploadu(&A[j+3*PacketSize+iN1])),internal::pload(&X[j+3*PacketSize])))); -// } -// for (int j = ANP;j<AN;j+=PacketSize) -// internal::pstore(&X[j], -// internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])), -// internal::padd(internal::pmul(ptmp1,internal::ploadu(&A[j+iN1])),internal::pload(&X[j])))); -// } -// } -// // process remaining scalars -// for (int j=AN;j<N;j++) -// X[j] += tmp0 * A[j+iN0] + tmp1 * A[j+iN1]; -// } -// int remaining = (N/2)*2; -// for (int i=remaining;i<N;i++) -// { -// real tmp0 = B[i]; -// Packet ptmp0 = internal::pset1(tmp0); -// int iN0 = i*N; -// if (AN>0) -// { -// bool aligned0 = (iN0 % PacketSize) == 0; -// if (aligned0) -// for (int j = 0;j<AN;j+=PacketSize) -// internal::pstore(&X[j], internal::padd(internal::pmul(ptmp0,internal::pload(&A[j+iN0])),internal::pload(&X[j]))); -// else -// for (int j = 0;j<AN;j+=PacketSize) -// internal::pstore(&X[j], internal::padd(internal::pmul(ptmp0,internal::ploadu(&A[j+iN0])),internal::pload(&X[j]))); -// } -// // process remaining scalars -// for (int j=AN;j<N;j++) -// X[j] += tmp0 * A[j+iN0]; -// } -// asm("#end matrix_vector_product"); -// } - -// static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) -// { -// asm("#begin matrix_vector_product"); -// int AN = (N/PacketSize)*PacketSize; -// for (int i=0;i<N;i++) -// X[i] = 0; -// for (int i=0;i<N;i++) -// { -// real tmp = B[i]; -// Packet ptmp = internal::pset1(tmp); -// int iN = i*N; -// if (AN>0) -// { -// bool aligned = (iN % PacketSize) == 0; -// if (aligned) -// { -// #ifdef PEELING -// Packet A0, A1, A2, X0, X1, X2; -// int ANP = (AN/(8*PacketSize))*8*PacketSize; -// for (int j = 0;j<ANP;j+=PacketSize*8) -// { -// A0 = internal::pload(&A[j+iN]); -// X0 = internal::pload(&X[j]); -// A1 = internal::pload(&A[j+PacketSize+iN]); -// X1 = internal::pload(&X[j+PacketSize]); -// A2 = internal::pload(&A[j+2*PacketSize+iN]); -// X2 = internal::pload(&X[j+2*PacketSize]); -// internal::pstore(&X[j], internal::padd(X0, internal::pmul(ptmp,A0))); -// A0 = internal::pload(&A[j+3*PacketSize+iN]); -// X0 = internal::pload(&X[j+3*PacketSize]); -// internal::pstore(&X[j+PacketSize], internal::padd(internal::pload(&X1), internal::pmul(ptmp,A1))); -// A1 = internal::pload(&A[j+4*PacketSize+iN]); -// X1 = internal::pload(&X[j+4*PacketSize]); -// internal::pstore(&X[j+2*PacketSize], internal::padd(internal::pload(&X2), internal::pmul(ptmp,A2))); -// A2 = internal::pload(&A[j+5*PacketSize+iN]); -// X2 = internal::pload(&X[j+5*PacketSize]); -// internal::pstore(&X[j+3*PacketSize], internal::padd(internal::pload(&X0), internal::pmul(ptmp,A0))); -// A0 = internal::pload(&A[j+6*PacketSize+iN]); -// X0 = internal::pload(&X[j+6*PacketSize]); -// internal::pstore(&X[j+4*PacketSize], internal::padd(internal::pload(&X1), internal::pmul(ptmp,A1))); -// A1 = internal::pload(&A[j+7*PacketSize+iN]); -// X1 = internal::pload(&X[j+7*PacketSize]); -// internal::pstore(&X[j+5*PacketSize], internal::padd(internal::pload(&X2), internal::pmul(ptmp,A2))); -// internal::pstore(&X[j+6*PacketSize], internal::padd(internal::pload(&X0), internal::pmul(ptmp,A0))); -// internal::pstore(&X[j+7*PacketSize], internal::padd(internal::pload(&X1), internal::pmul(ptmp,A1))); -// // -// // internal::pstore(&X[j], internal::padd(internal::pload(&X[j]), internal::pmul(ptmp,internal::pload(&A[j+iN])))); -// // internal::pstore(&X[j+PacketSize], internal::padd(internal::pload(&X[j+PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+PacketSize+iN])))); -// // internal::pstore(&X[j+2*PacketSize], internal::padd(internal::pload(&X[j+2*PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+2*PacketSize+iN])))); -// // internal::pstore(&X[j+3*PacketSize], internal::padd(internal::pload(&X[j+3*PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+3*PacketSize+iN])))); -// // internal::pstore(&X[j+4*PacketSize], internal::padd(internal::pload(&X[j+4*PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+4*PacketSize+iN])))); -// // internal::pstore(&X[j+5*PacketSize], internal::padd(internal::pload(&X[j+5*PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+5*PacketSize+iN])))); -// // internal::pstore(&X[j+6*PacketSize], internal::padd(internal::pload(&X[j+6*PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+6*PacketSize+iN])))); -// // internal::pstore(&X[j+7*PacketSize], internal::padd(internal::pload(&X[j+7*PacketSize]), internal::pmul(ptmp,internal::pload(&A[j+7*PacketSize+iN])))); -// } -// for (int j = ANP;j<AN;j+=PacketSize) -// internal::pstore(&X[j], internal::padd(internal::pload(&X[j]), internal::pmul(ptmp,internal::pload(&A[j+iN])))); -// #else -// for (int j = 0;j<AN;j+=PacketSize) -// internal::pstore(&X[j], internal::padd(internal::pload(&X[j]), internal::pmul(ptmp,internal::pload(&A[j+iN])))); -// #endif -// } -// else -// { -// #ifdef PEELING -// int ANP = (AN/(8*PacketSize))*8*PacketSize; -// for (int j = 0;j<ANP;j+=PacketSize*8) -// { -// internal::pstore(&X[j], internal::padd(internal::pload(&X[j]), internal::pmul(ptmp,internal::ploadu(&A[j+iN])))); -// internal::pstore(&X[j+PacketSize], internal::padd(internal::pload(&X[j+PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+PacketSize+iN])))); -// internal::pstore(&X[j+2*PacketSize], internal::padd(internal::pload(&X[j+2*PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+2*PacketSize+iN])))); -// internal::pstore(&X[j+3*PacketSize], internal::padd(internal::pload(&X[j+3*PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+3*PacketSize+iN])))); -// internal::pstore(&X[j+4*PacketSize], internal::padd(internal::pload(&X[j+4*PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+4*PacketSize+iN])))); -// internal::pstore(&X[j+5*PacketSize], internal::padd(internal::pload(&X[j+5*PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+5*PacketSize+iN])))); -// internal::pstore(&X[j+6*PacketSize], internal::padd(internal::pload(&X[j+6*PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+6*PacketSize+iN])))); -// internal::pstore(&X[j+7*PacketSize], internal::padd(internal::pload(&X[j+7*PacketSize]), internal::pmul(ptmp,internal::ploadu(&A[j+7*PacketSize+iN])))); -// } -// for (int j = ANP;j<AN;j+=PacketSize) -// internal::pstore(&X[j], internal::padd(internal::pload(&X[j]), internal::pmul(ptmp,internal::ploadu(&A[j+iN])))); -// #else -// for (int j = 0;j<AN;j+=PacketSize) -// internal::pstore(&X[j], internal::padd(internal::pload(&X[j]), internal::pmul(ptmp,internal::ploadu(&A[j+iN])))); -// #endif -// } -// } -// // process remaining scalars -// for (int j=AN;j<N;j++) -// X[j] += tmp * A[j+iN]; -// } -// asm("#end matrix_vector_product"); -// } - - static inline void atv_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) - { - int AN = (N/PacketSize)*PacketSize; - int bound = (N/4)*4; - for (int i=0;i<bound;i+=4) - { - real tmp0 = 0; - Packet ptmp0 = internal::pset1(real(0)); - real tmp1 = 0; - Packet ptmp1 = internal::pset1(real(0)); - real tmp2 = 0; - Packet ptmp2 = internal::pset1(real(0)); - real tmp3 = 0; - Packet ptmp3 = internal::pset1(real(0)); - int iN0 = i*N; - int iN1 = (i+1)*N; - int iN2 = (i+2)*N; - int iN3 = (i+3)*N; - if (AN>0) - { - int align1 = (iN1 % PacketSize); - if (align1==0) - { - for (int j = 0;j<AN;j+=PacketSize) - { - Packet b = internal::pload(&B[j]); - ptmp0 = internal::padd(ptmp0, internal::pmul(b, internal::pload(&A[j+iN0]))); - ptmp1 = internal::padd(ptmp1, internal::pmul(b, internal::pload(&A[j+iN1]))); - ptmp2 = internal::padd(ptmp2, internal::pmul(b, internal::pload(&A[j+iN2]))); - ptmp3 = internal::padd(ptmp3, internal::pmul(b, internal::pload(&A[j+iN3]))); - } - } - else if (align1==2) - { - for (int j = 0;j<AN;j+=PacketSize) - { - Packet b = internal::pload(&B[j]); - ptmp0 = internal::padd(ptmp0, internal::pmul(b, internal::pload(&A[j+iN0]))); - ptmp1 = internal::padd(ptmp1, internal::pmul(b, internal::ploadu(&A[j+iN1]))); - ptmp2 = internal::padd(ptmp2, internal::pmul(b, internal::pload(&A[j+iN2]))); - ptmp3 = internal::padd(ptmp3, internal::pmul(b, internal::ploadu(&A[j+iN3]))); - } - } - else - { - for (int j = 0;j<AN;j+=PacketSize) - { - Packet b = internal::pload(&B[j]); - ptmp0 = internal::padd(ptmp0, internal::pmul(b, internal::pload(&A[j+iN0]))); - ptmp1 = internal::padd(ptmp1, internal::pmul(b, internal::ploadu(&A[j+iN1]))); - ptmp2 = internal::padd(ptmp2, internal::pmul(b, internal::ploadu(&A[j+iN2]))); - ptmp3 = internal::padd(ptmp3, internal::pmul(b, internal::ploadu(&A[j+iN3]))); - } - } - tmp0 = internal::predux(ptmp0); - tmp1 = internal::predux(ptmp1); - tmp2 = internal::predux(ptmp2); - tmp3 = internal::predux(ptmp3); - } - // process remaining scalars - for (int j=AN;j<N;j++) - { - tmp0 += B[j] * A[j+iN0]; - tmp1 += B[j] * A[j+iN1]; - tmp2 += B[j] * A[j+iN2]; - tmp3 += B[j] * A[j+iN3]; - } - X[i+0] = tmp0; - X[i+1] = tmp1; - X[i+2] = tmp2; - X[i+3] = tmp3; - } - - for (int i=bound;i<N;i++) - { - real tmp0 = 0; - Packet ptmp0 = internal::pset1(real(0)); - int iN0 = i*N; - if (AN>0) - { - if (iN0 % PacketSize==0) - for (int j = 0;j<AN;j+=PacketSize) - ptmp0 = internal::padd(ptmp0, internal::pmul(internal::pload(&B[j]), internal::pload(&A[j+iN0]))); - else - for (int j = 0;j<AN;j+=PacketSize) - ptmp0 = internal::padd(ptmp0, internal::pmul(internal::pload(&B[j]), internal::ploadu(&A[j+iN0]))); - tmp0 = internal::predux(ptmp0); - } - // process remaining scalars - for (int j=AN;j<N;j++) - tmp0 += B[j] * A[j+iN0]; - X[i+0] = tmp0; - } - } - -// static inline void atv_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N) -// { -// int AN = (N/PacketSize)*PacketSize; -// for (int i=0;i<N;i++) -// X[i] = 0; -// for (int i=0;i<N;i++) -// { -// real tmp = 0; -// Packet ptmp = internal::pset1(real(0)); -// int iN = i*N; -// if (AN>0) -// { -// bool aligned = (iN % PacketSize) == 0; -// if (aligned) -// { -// #ifdef PEELING -// int ANP = (AN/(8*PacketSize))*8*PacketSize; -// for (int j = 0;j<ANP;j+=PacketSize*8) -// { -// ptmp = -// internal::padd(internal::pmul(internal::pload(&B[j]), internal::pload(&A[j+iN])), -// internal::padd(internal::pmul(internal::pload(&B[j+PacketSize]), internal::pload(&A[j+PacketSize+iN])), -// internal::padd(internal::pmul(internal::pload(&B[j+2*PacketSize]), internal::pload(&A[j+2*PacketSize+iN])), -// internal::padd(internal::pmul(internal::pload(&B[j+3*PacketSize]), internal::pload(&A[j+3*PacketSize+iN])), -// internal::padd(internal::pmul(internal::pload(&B[j+4*PacketSize]), internal::pload(&A[j+4*PacketSize+iN])), -// internal::padd(internal::pmul(internal::pload(&B[j+5*PacketSize]), internal::pload(&A[j+5*PacketSize+iN])), -// internal::padd(internal::pmul(internal::pload(&B[j+6*PacketSize]), internal::pload(&A[j+6*PacketSize+iN])), -// internal::padd(internal::pmul(internal::pload(&B[j+7*PacketSize]), internal::pload(&A[j+7*PacketSize+iN])), -// ptmp)))))))); -// } -// for (int j = ANP;j<AN;j+=PacketSize) -// ptmp = internal::padd(ptmp, internal::pmul(internal::pload(&B[j]), internal::pload(&A[j+iN]))); -// #else -// for (int j = 0;j<AN;j+=PacketSize) -// ptmp = internal::padd(ptmp, internal::pmul(internal::pload(&B[j]), internal::pload(&A[j+iN]))); -// #endif -// } -// else -// { -// #ifdef PEELING -// int ANP = (AN/(8*PacketSize))*8*PacketSize; -// for (int j = 0;j<ANP;j+=PacketSize*8) -// { -// ptmp = -// internal::padd(internal::pmul(internal::pload(&B[j]), internal::ploadu(&A[j+iN])), -// internal::padd(internal::pmul(internal::pload(&B[j+PacketSize]), internal::ploadu(&A[j+PacketSize+iN])), -// internal::padd(internal::pmul(internal::pload(&B[j+2*PacketSize]), internal::ploadu(&A[j+2*PacketSize+iN])), -// internal::padd(internal::pmul(internal::pload(&B[j+3*PacketSize]), internal::ploadu(&A[j+3*PacketSize+iN])), -// internal::padd(internal::pmul(internal::pload(&B[j+4*PacketSize]), internal::ploadu(&A[j+4*PacketSize+iN])), -// internal::padd(internal::pmul(internal::pload(&B[j+5*PacketSize]), internal::ploadu(&A[j+5*PacketSize+iN])), -// internal::padd(internal::pmul(internal::pload(&B[j+6*PacketSize]), internal::ploadu(&A[j+6*PacketSize+iN])), -// internal::padd(internal::pmul(internal::pload(&B[j+7*PacketSize]), internal::ploadu(&A[j+7*PacketSize+iN])), -// ptmp)))))))); -// } -// for (int j = ANP;j<AN;j+=PacketSize) -// ptmp = internal::padd(ptmp, internal::pmul(internal::pload(&B[j]), internal::ploadu(&A[j+iN]))); -// #else -// for (int j = 0;j<AN;j+=PacketSize) -// ptmp = internal::padd(ptmp, internal::pmul(internal::pload(&B[j]), internal::ploadu(&A[j+iN]))); -// #endif -// } -// tmp = internal::predux(ptmp); -// } -// // process remaining scalars -// for (int j=AN;j<N;j++) -// tmp += B[j] * A[j+iN]; -// X[i] = tmp; -// } -// } - - static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int N){ - int AN = (N/PacketSize)*PacketSize; - if (AN>0) - { - Packet pcoef = internal::pset1(coef); - #ifdef PEELING - const int peelSize = 3; - int ANP = (AN/(peelSize*PacketSize))*peelSize*PacketSize; - float* X1 = X + PacketSize; - float* Y1 = Y + PacketSize; - float* X2 = X + 2*PacketSize; - float* Y2 = Y + 2*PacketSize; - Packet x0,x1,x2,y0,y1,y2; - for (int j = 0;j<ANP;j+=PacketSize*peelSize) - { - x0 = internal::pload(X+j); - x1 = internal::pload(X1+j); - x2 = internal::pload(X2+j); - - y0 = internal::pload(Y+j); - y1 = internal::pload(Y1+j); - y2 = internal::pload(Y2+j); - - y0 = internal::pmadd(pcoef, x0, y0); - y1 = internal::pmadd(pcoef, x1, y1); - y2 = internal::pmadd(pcoef, x2, y2); - - internal::pstore(Y+j, y0); - internal::pstore(Y1+j, y1); - internal::pstore(Y2+j, y2); -// internal::pstore(&Y[j+2*PacketSize], internal::padd(internal::pload(&Y[j+2*PacketSize]), internal::pmul(pcoef,internal::pload(&X[j+2*PacketSize])))); -// internal::pstore(&Y[j+3*PacketSize], internal::padd(internal::pload(&Y[j+3*PacketSize]), internal::pmul(pcoef,internal::pload(&X[j+3*PacketSize])))); -// internal::pstore(&Y[j+4*PacketSize], internal::padd(internal::pload(&Y[j+4*PacketSize]), internal::pmul(pcoef,internal::pload(&X[j+4*PacketSize])))); -// internal::pstore(&Y[j+5*PacketSize], internal::padd(internal::pload(&Y[j+5*PacketSize]), internal::pmul(pcoef,internal::pload(&X[j+5*PacketSize])))); -// internal::pstore(&Y[j+6*PacketSize], internal::padd(internal::pload(&Y[j+6*PacketSize]), internal::pmul(pcoef,internal::pload(&X[j+6*PacketSize])))); -// internal::pstore(&Y[j+7*PacketSize], internal::padd(internal::pload(&Y[j+7*PacketSize]), internal::pmul(pcoef,internal::pload(&X[j+7*PacketSize])))); - } - for (int j = ANP;j<AN;j+=PacketSize) - internal::pstore(&Y[j], internal::padd(internal::pload(&Y[j]), internal::pmul(pcoef,internal::pload(&X[j])))); - #else - for (int j = 0;j<AN;j+=PacketSize) - internal::pstore(&Y[j], internal::padd(internal::pload(&Y[j]), internal::pmul(pcoef,internal::pload(&X[j])))); - #endif - } - // process remaining scalars - for (int i=AN;i<N;i++) - Y[i] += coef * X[i]; - } - - -}; - -#endif diff --git a/bench/btl/libs/hand_vec/main.cpp b/bench/btl/libs/hand_vec/main.cpp deleted file mode 100644 index 8bde4c89b..000000000 --- a/bench/btl/libs/hand_vec/main.cpp +++ /dev/null @@ -1,50 +0,0 @@ -//===================================================== -// File : main.cpp -// Author : L. Plagne <laurent.plagne@edf.fr)> -// Copyright (C) EDF R&D, lun sep 30 14:23:23 CEST 2002 -//===================================================== -// -// This program is free software; you can redistribute it and/or -// modify it under the terms of the GNU General Public License -// as published by the Free Software Foundation; either version 2 -// of the License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -// -#include "utilities.h" -#include "bench.hh" -#include "hand_vec_interface.hh" -#include "action_matrix_vector_product.hh" -#include "action_atv_product.hh" -#include "action_matrix_matrix_product.hh" -#include "action_axpy.hh" -#include "action_ata_product.hh" -#include "action_aat_product.hh" -#include "basic_actions.hh" -//#include "action_lu_solve.hh" -// #include "timers/mixed_perf_analyzer.hh" - -BTL_MAIN; - -int main() -{ - - bench<Action_matrix_vector_product<hand_vec_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); - bench<Action_atv_product<hand_vec_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT); -// bench<Action_matrix_matrix_product<hand_vec_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); -// bench<Action_aat_product<hand_vec_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); -// bench<Action_ata_product<hand_vec_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); - bench<Action_symv<hand_vec_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT); - bench<Action_axpy<hand_vec_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT); - - - return 0; -} - - |