diff options
-rw-r--r--  Eigen/src/Core/products/GeneralMatrixMatrix.h | 11 ++++++-----
-rw-r--r--  Eigen/src/Core/products/Parallelizer.h        | 11 ++++++-----
-rw-r--r--  bench/bench_gemm.cpp                           |  2 --
3 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index da700f8b7..cf42855eb 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -98,7 +98,6 @@ static void run(int rows, int cols, int depth,
 // if you have the GOTO blas library you can try our parallelization strategy
 // using GOTO's optimized routines.
-// #define USEGOTOROUTINES
 #ifdef USEGOTOROUTINES
 void* u = alloca(4096+sizeW);
 #endif
@@ -125,7 +124,8 @@ static void run(int rows, int cols, int depth,
 // However, before copying to B'_j, we have to make sure that no other thread is still using it,
 // i.e., we test that info[tid].users equals 0.
 // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
-while(!info[tid].users.testAndSetOrdered(0,threads)) {}
+while(info[tid].users!=0) {}
+info[tid].users += threads;
 #ifndef USEGOTOROUTINES
 pack_rhs(blockB+info[tid].rhs_start*kc, &rhs(k,info[tid].rhs_start), rhsStride, alpha, actual_kc, info[tid].rhs_length);
@@ -134,7 +134,7 @@ static void run(int rows, int cols, int depth,
 #endif
 // Notify the other threads that the part B'_j is ready to go.
-info[tid].sync.fetchAndStoreOrdered(k);
+info[tid].sync = k;
 // Computes C_i += A' * B' per B'_j
 for(int shift=0; shift<threads; ++shift)
@@ -145,7 +145,7 @@ static void run(int rows, int cols, int depth,
 // we use testAndSetOrdered to mimic a volatile access.
 // However, no need to wait for the B' part which has been updated by the current thread!
 if(shift>0)
-while(!info[j].sync.testAndSetOrdered(k,k)) {}
+while(info[j].sync!=k) {}
 #ifndef USEGOTOROUTINES
 gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*kc, mc, actual_kc, info[j].rhs_length, -1,-1,0,0, w);
@@ -178,7 +178,8 @@ static void run(int rows, int cols, int depth,
 // Release all the sub blocks B'_j of B' for the current thread,
 // i.e., we simply decrement the number of users by 1
 for(int j=0; j<threads; ++j)
-info[j].users.fetchAndAddOrdered(-1);
+#pragma omp atomic
+--(info[j].users);
 }
 ei_aligned_stack_delete(Scalar, blockA, kc*mc);
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index 620a5e8ba..404b8d390 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -92,8 +92,11 @@ void ei_run_parallel_2d(const Functor& func, int size1, int size2)
 struct GemmParallelInfo
 {
-  QAtomicInt sync;
-  QAtomicInt users;
+  GemmParallelInfo() : sync(-1), users(0) {}
+
+  int volatile sync;
+  int volatile users;
+
   int rhs_start;
   int rhs_length;
   float* blockB;
@@ -118,7 +121,7 @@ void ei_run_parallel_gemm(const Functor& func, int rows, int cols)
 GemmParallelInfo* info = new GemmParallelInfo[threads];
-#pragma omp parallel for schedule(static,1)
+#pragma omp parallel for schedule(static,1) shared(info)
 for(int i=0; i<threads; ++i)
 {
 int r0 = i*blockRows;
@@ -130,8 +133,6 @@ void ei_run_parallel_gemm(const Functor& func, int rows, int cols)
 info[i].rhs_start = c0;
 info[i].rhs_length = actualBlockCols;
 info[i].blockB = sharedBlockB;
-info[i].sync.fetchAndStoreOrdered(-1);
-info[i].users.fetchAndStoreOrdered(0);
 func(r0, actualBlockRows, 0,cols, info);
 }
diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp
index 12df7bcbc..653a880a8 100644
--- a/bench/bench_gemm.cpp
+++ b/bench/bench_gemm.cpp
@@ -2,8 +2,6 @@
 // g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2 ./a.out
 // icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out
-#include <QAtomicInt>
-
 #include <Eigen/Core>
 #include <bench/BenchTimer.h>