diff options
author | Gael Guennebaud <g.gael@free.fr> | 2010-02-26 14:57:49 +0100 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2010-02-26 14:57:49 +0100 |
commit | ac425090f389e34f9aee71b5957cca529ac74a38 (patch) | |
tree | 9fc044c93118431a53470ea9d1a6ba90a360e162 | |
parent | 8d4a0e6753117cbabf80e4b6fa13d3d3b6ba0327 (diff) |
BTL: allow to bench real time
-rw-r--r-- | bench/bench_gemm.cpp | 5 | ||||
-rw-r--r-- | bench/bench_gemm_blas.cpp | 109 | ||||
-rw-r--r-- | bench/btl/generic_bench/bench_parameter.hh | 2 | ||||
-rw-r--r-- | bench/btl/generic_bench/btl.hh | 21 | ||||
-rw-r--r-- | bench/btl/generic_bench/timers/portable_perf_analyzer.hh | 2 | ||||
-rwxr-xr-x | bench/btl/generic_bench/timers/portable_timer.hh | 44 |
6 files changed, 142 insertions, 41 deletions
diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp index d958cc1bf..c7a3db619 100644 --- a/bench/bench_gemm.cpp +++ b/bench/bench_gemm.cpp @@ -3,6 +3,7 @@ // icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out #include <Eigen/Core> + #include <bench/BenchTimer.h> using namespace std; @@ -68,10 +69,10 @@ void gemm(const M& a, const M& b, M& c) int main(int argc, char ** argv) { - int rep = 1; // number of repetitions per try + int rep = 2048; // number of repetitions per try int tries = 5; // number of tries, we keep the best - int s = 2048; + int s = 512; int m = s; int n = s; int p = s; diff --git a/bench/bench_gemm_blas.cpp b/bench/bench_gemm_blas.cpp new file mode 100644 index 000000000..254302312 --- /dev/null +++ b/bench/bench_gemm_blas.cpp @@ -0,0 +1,109 @@ + +#include <Eigen/Core> +#include <bench/BenchTimer.h> + +extern "C" +{ + #include <bench/btl/libs/C_BLAS/blas.h> + #include <cblas.h> + + void sgemm_kernel(int actual_mc, int cols, int actual_kc, float alpha, + float* blockA, float* blockB, float* res, int resStride); + + void sgemm_otcopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB); + void sgemm_oncopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB); + void sgemm_itcopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB); + void sgemm_incopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB); +} + +using namespace std; +using namespace Eigen; + +#ifndef SCALAR +#define SCALAR float +#endif + +typedef SCALAR Scalar; +typedef Matrix<Scalar,Dynamic,Dynamic> M; + +static float fone = 1; +static float fzero = 0; +static double done = 1; +static double szero = 0; +static char notrans = 'N'; +static char trans = 'T'; +static char nonunit = 'N'; +static char lower = 'L'; +static char right = 'R'; +static int intone = 1; + +void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c) +{ + int M = c.rows(); + int N = c.cols(); + 
int K = a.cols(); + + int lda = a.rows(); + int ldb = b.rows(); + int ldc = c.rows(); + +// c.noalias() += a * b; + sgemm_(&notrans,&notrans,&M,&N,&K,&fone, + const_cast<float*>(a.data()),&lda, + const_cast<float*>(b.data()),&ldb,&fone, + c.data(),&ldc); +} + +void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c) +{ + int M = c.rows(); + int N = c.cols(); + int K = a.cols(); + + int lda = a.rows(); + int ldb = b.rows(); + int ldc = c.rows(); + +// c.noalias() += a * b; + + dgemm_(&notrans,&notrans,&M,&N,&K,&done, + const_cast<double*>(a.data()),&lda, + const_cast<double*>(b.data()),&ldb,&done, + c.data(),&ldc); +} + +int main(int argc, char **argv) +{ + int rep = 1; + int s = 2048; + int m = s; + int n = s; + int p = s; + const int N = 1; + M a[N]; + M b[N]; + M c[N]; + + for (int k=0; k<N; ++k) + { + a[k].resize(m,p); a[k].setOnes(); + b[k].resize(p,n); b[k].setOnes(); + c[k].resize(m,n); c[k].setOnes(); + } + + BenchTimer t; + + BENCH(t, 5, rep, + for(int k=0;k<N;++k) + blas_gemm(a[k],b[k],c[k])); + +// BENCH(t, 5, rep, +// _Pragma("omp parallel for schedule(static,1)") +// for(int k=0;k<N;++k) +// blas_gemm(a[k],b[k],c[k])); + + std::cerr << "cpu " << t.best(CPU_TIMER)/rep << "s \t" << (double(m)*N*n*p*rep*2/t.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER) << "s)\n"; + std::cerr << "real " << t.best(REAL_TIMER)/rep << "s \t" << (double(m)*N*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n"; + return 0; +} + diff --git a/bench/btl/generic_bench/bench_parameter.hh b/bench/btl/generic_bench/bench_parameter.hh index d14340037..4c355cd6e 100644 --- a/bench/btl/generic_bench/bench_parameter.hh +++ b/bench/btl/generic_bench/bench_parameter.hh @@ -48,6 +48,6 @@ #define DEFAULT_NB_SAMPLE 1000 // how many times we run a single bench (keep the best perf) -#define NB_TRIES 3 +#define DEFAULT_NB_TRIES 3 #endif diff --git a/bench/btl/generic_bench/btl.hh b/bench/btl/generic_bench/btl.hh index fdc099296..fd501c313 100644 --- 
a/bench/btl/generic_bench/btl.hh +++ b/bench/btl/generic_bench/btl.hh @@ -41,7 +41,7 @@ #if (defined __GNUC__) #define BTL_ASM_COMMENT(X) asm("#"X) #else -#define BTL_ASM_COMMENT(X) +#define BTL_ASM_COMMENT(X) #endif #if (defined __GNUC__) && (!defined __INTEL_COMPILER) @@ -169,7 +169,7 @@ class BtlConfig { public: BtlConfig() - : overwriteResults(false), checkResults(true) + : overwriteResults(false), checkResults(true), realclock(false), tries(DEFAULT_NB_TRIES) { char * _config; _config = getenv ("BTL_CONFIG"); @@ -189,6 +189,17 @@ public: i += 1; } + else if (config[i].beginsWith("-t")) + { + if (i+1==config.size()) + { + std::cerr << "error processing option: " << config[i] << "\n"; + exit(2); + } + Instance.tries = atoi(config[i+1].c_str()); + + i += 1; + } else if (config[i].beginsWith("--overwrite")) { Instance.overwriteResults = true; @@ -197,6 +208,10 @@ public: { Instance.checkResults = false; } + else if (config[i].beginsWith("--real")) + { + Instance.realclock = true; + } } } @@ -219,6 +234,8 @@ public: static BtlConfig Instance; bool overwriteResults; bool checkResults; + bool realclock; + int tries; protected: std::vector<BtlString> m_selectedActionNames; diff --git a/bench/btl/generic_bench/timers/portable_perf_analyzer.hh b/bench/btl/generic_bench/timers/portable_perf_analyzer.hh index 6b1f8e7d7..5c337471e 100644 --- a/bench/btl/generic_bench/timers/portable_perf_analyzer.hh +++ b/bench/btl/generic_bench/timers/portable_perf_analyzer.hh @@ -53,7 +53,7 @@ public: } // optimize - for (int i=1; i<NB_TRIES; ++i) + for (int i=1; i<BtlConfig::Instance.tries; ++i) { Action _action(size); std::cout << " " << _action.nb_op_base()*_nb_calc/(m_time_action*1e6) << " "; diff --git a/bench/btl/generic_bench/timers/portable_timer.hh b/bench/btl/generic_bench/timers/portable_timer.hh index 42528d113..e6ad309fe 100755 --- a/bench/btl/generic_bench/timers/portable_timer.hh +++ b/bench/btl/generic_bench/timers/portable_timer.hh @@ -98,70 +98,44 @@ class Portable_Timer 
{ public: - Portable_Timer( void ) -// :_utime_sec_start(-1), -// _utime_usec_start(-1), -// _utime_sec_stop(-1), -// _utime_usec_stop(-1)/*, -// m_prev_cs(-1)*/ + Portable_Timer() { + m_clkid = BtlConfig::Instance.realclock ? CLOCK_REALTIME : CLOCK_PROCESS_CPUTIME_ID; } + Portable_Timer(int clkid) : m_clkid(clkid) + {} - void start() + void start() { -// int status=getrusage(RUSAGE_SELF, &resourcesUsage) ; -// _utime_sec_start = resourcesUsage.ru_utime.tv_sec ; -// _utime_usec_start = resourcesUsage.ru_utime.tv_usec ; - timespec ts; - clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); + clock_gettime(m_clkid, &ts); m_start_time = double(ts.tv_sec) + 1e-9 * double(ts.tv_nsec); } void stop() { -// int status=getrusage(RUSAGE_SELF, &resourcesUsage) ; -// _utime_sec_stop = resourcesUsage.ru_utime.tv_sec ; -// _utime_usec_stop = resourcesUsage.ru_utime.tv_usec ; - timespec ts; - clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); + clock_gettime(m_clkid, &ts); m_stop_time = double(ts.tv_sec) + 1e-9 * double(ts.tv_nsec); } double elapsed() { - return user_time();//double(_stop_time - _start_time) / CLOCKS_PER_SEC; + return user_time(); } double user_time() { -// std::cout << m_prev_cs << "\n"; -// long tot_utime_sec=_utime_sec_stop-_utime_sec_start; -// long tot_utime_usec=_utime_usec_stop-_utime_usec_start; -// return double(tot_utime_sec)+ double(tot_utime_usec)/double(USEC_IN_SEC) ; return m_stop_time - m_start_time; } private: -// struct rusage resourcesUsage ; - -// long _utime_sec_start ; -// long _utime_usec_start ; - -// long _utime_sec_stop ; -// long _utime_usec_stop ; - -// long m_prev_cs; - -// std::clock_t _start_time; -// std::clock_t _stop_time; - + int m_clkid; double m_stop_time, m_start_time; }; // Portable_Timer |