diff options
Diffstat (limited to 'bench')
-rw-r--r-- | bench/benchCholesky.cpp | 16 | ||||
-rw-r--r-- | bench/btl/libs/blaze/CMakeLists.txt | 7 | ||||
-rw-r--r-- | bench/dense_solvers.cpp | 200 | ||||
-rw-r--r-- | bench/perf_monitoring/gemm/changesets.txt | 16 | ||||
-rw-r--r-- | bench/perf_monitoring/gemm/lazy_gemm.cpp | 9 | ||||
-rwxr-xr-x | bench/perf_monitoring/gemm/make_plot.sh | 2 | ||||
-rwxr-xr-x | bench/perf_monitoring/gemm/run.sh | 8 | ||||
-rw-r--r-- | bench/tensors/README | 13 | ||||
-rw-r--r-- | bench/tensors/contraction_benchmarks_cpu.cc | 39 | ||||
-rw-r--r-- | bench/tensors/tensor_benchmarks.h | 7 | ||||
-rw-r--r-- | bench/tensors/tensor_benchmarks_fp16_gpu.cu | 1 |
11 files changed, 243 insertions, 75 deletions
diff --git a/bench/benchCholesky.cpp b/bench/benchCholesky.cpp index 42b3e1285..9a8e7cf63 100644 --- a/bench/benchCholesky.cpp +++ b/bench/benchCholesky.cpp @@ -31,7 +31,7 @@ __attribute__ ((noinline)) void benchLLT(const MatrixType& m) int rows = m.rows(); int cols = m.cols(); - int cost = 0; + double cost = 0; for (int j=0; j<rows; ++j) { int r = std::max(rows - j -1,0); @@ -78,10 +78,10 @@ __attribute__ ((noinline)) void benchLLT(const MatrixType& m) else std::cout << "fixed "; std::cout << covMat.rows() << " \t" - << (timerNoSqrt.value() * REPEAT) / repeats << "s " - << "(" << 1e-6 * cost*repeats/timerNoSqrt.value() << " MFLOPS)\t" - << (timerSqrt.value() * REPEAT) / repeats << "s " - << "(" << 1e-6 * cost*repeats/timerSqrt.value() << " MFLOPS)\n"; + << (timerNoSqrt.best()) / repeats << "s " + << "(" << 1e-9 * cost*repeats/timerNoSqrt.best() << " GFLOPS)\t" + << (timerSqrt.best()) / repeats << "s " + << "(" << 1e-9 * cost*repeats/timerSqrt.best() << " GFLOPS)\n"; #ifdef BENCH_GSL @@ -119,13 +119,13 @@ __attribute__ ((noinline)) void benchLLT(const MatrixType& m) int main(int argc, char* argv[]) { - const int dynsizes[] = {4,6,8,16,24,32,49,64,128,256,512,900,0}; - std::cout << "size no sqrt standard"; + const int dynsizes[] = {4,6,8,16,24,32,49,64,128,256,512,900,1500,0}; + std::cout << "size LDLT LLT"; // #ifdef BENCH_GSL // std::cout << " GSL (standard + double + ATLAS) "; // #endif std::cout << "\n"; - for (uint i=0; dynsizes[i]>0; ++i) + for (int i=0; dynsizes[i]>0; ++i) benchLLT(Matrix<Scalar,Dynamic,Dynamic>(dynsizes[i],dynsizes[i])); benchLLT(Matrix<Scalar,2,2>()); diff --git a/bench/btl/libs/blaze/CMakeLists.txt b/bench/btl/libs/blaze/CMakeLists.txt index f8b1b2ec3..e99a0855c 100644 --- a/bench/btl/libs/blaze/CMakeLists.txt +++ b/bench/btl/libs/blaze/CMakeLists.txt @@ -1,10 +1,13 @@ find_package(BLAZE) -find_package(Boost) +find_package(Boost COMPONENTS system) if (BLAZE_FOUND AND Boost_FOUND) include_directories(${BLAZE_INCLUDE_DIR} ${Boost_INCLUDE_DIRS}) btl_add_bench(btl_blaze main.cpp) + # Note: The newest blaze version requires C++14. + # Ideally, we should set this depending on the version of Blaze we found + set_property(TARGET btl_blaze PROPERTY CXX_STANDARD 14) if(BUILD_btl_blaze) - target_link_libraries(btl_blaze ${Boost_LIBRARIES} ${Boost_system_LIBRARY} /opt/local/lib/libboost_system-mt.a ) + target_link_libraries(btl_blaze ${Boost_LIBRARIES}) endif() endif () diff --git a/bench/dense_solvers.cpp b/bench/dense_solvers.cpp index aa4ff011f..24343dcd8 100644 --- a/bench/dense_solvers.cpp +++ b/bench/dense_solvers.cpp @@ -2,47 +2,74 @@ #include "BenchTimer.h" #include <Eigen/Dense> #include <map> +#include <vector> #include <string> +#include <sstream> using namespace Eigen; -std::map<std::string,Array<float,1,4> > results; +std::map<std::string,Array<float,1,8,DontAlign|RowMajor> > results; +std::vector<std::string> labels; +std::vector<Array2i> sizes; + +template<typename Solver,typename MatrixType> +EIGEN_DONT_INLINE +void compute_norm_equation(Solver &solver, const MatrixType &A) { + if(A.rows()!=A.cols()) + solver.compute(A.transpose()*A); + else + solver.compute(A); +} + +template<typename Solver,typename MatrixType> +EIGEN_DONT_INLINE +void compute(Solver &solver, const MatrixType &A) { + solver.compute(A); +} template<typename Scalar,int Size> -void bench(int id, int size = Size) +void bench(int id, int rows, int size = Size) { - typedef Matrix<Scalar,Size,Size> Mat; - Mat A(size,size); + typedef Matrix<Scalar,Dynamic,Size> Mat; + typedef Matrix<Scalar,Dynamic,Dynamic> MatDyn; + typedef Matrix<Scalar,Size,Size> MatSquare; + Mat A(rows,size); A.setRandom(); - A = A*A.adjoint(); + if(rows==size) + A = A*A.adjoint(); BenchTimer t_llt, t_ldlt, t_lu, t_fplu, t_qr, t_cpqr, t_cod, t_fpqr, t_jsvd, t_bdcsvd; + + int svd_opt = ComputeThinU|ComputeThinV; - int tries = 3; + int tries = 5; int rep = 1000/size; if(rep==0) rep = 1; // rep = rep*rep; - LLT<Mat> llt(A); - LDLT<Mat> ldlt(A); - PartialPivLU<Mat> lu(A); - FullPivLU<Mat> fplu(A); - HouseholderQR<Mat> qr(A); - ColPivHouseholderQR<Mat> cpqr(A); - CompleteOrthogonalDecomposition<Mat> cod(A); - FullPivHouseholderQR<Mat> fpqr(A); - JacobiSVD<Mat> jsvd(A.rows(),A.cols()); - BDCSVD<Mat> bdcsvd(A.rows(),A.cols()); + LLT<MatSquare> llt(size); + LDLT<MatSquare> ldlt(size); + PartialPivLU<MatSquare> lu(size); + FullPivLU<MatSquare> fplu(size,size); + HouseholderQR<Mat> qr(A.rows(),A.cols()); + ColPivHouseholderQR<Mat> cpqr(A.rows(),A.cols()); + CompleteOrthogonalDecomposition<Mat> cod(A.rows(),A.cols()); + FullPivHouseholderQR<Mat> fpqr(A.rows(),A.cols()); + JacobiSVD<MatDyn> jsvd(A.rows(),A.cols()); + BDCSVD<MatDyn> bdcsvd(A.rows(),A.cols()); - BENCH(t_llt, tries, rep, llt.compute(A)); - BENCH(t_ldlt, tries, rep, ldlt.compute(A)); - BENCH(t_lu, tries, rep, lu.compute(A)); - BENCH(t_fplu, tries, rep, fplu.compute(A)); - BENCH(t_qr, tries, rep, qr.compute(A)); - BENCH(t_cpqr, tries, rep, cpqr.compute(A)); - BENCH(t_cod, tries, rep, cod.compute(A)); - BENCH(t_fpqr, tries, rep, fpqr.compute(A)); + BENCH(t_llt, tries, rep, compute_norm_equation(llt,A)); + BENCH(t_ldlt, tries, rep, compute_norm_equation(ldlt,A)); + BENCH(t_lu, tries, rep, compute_norm_equation(lu,A)); + if(size<=1000) + BENCH(t_fplu, tries, rep, compute_norm_equation(fplu,A)); + BENCH(t_qr, tries, rep, compute(qr,A)); + BENCH(t_cpqr, tries, rep, compute(cpqr,A)); + BENCH(t_cod, tries, rep, compute(cod,A)); + if(size*rows<=10000000) + BENCH(t_fpqr, tries, rep, compute(fpqr,A)); if(size<500) // JacobiSVD is really too slow for too large matrices - BENCH(t_jsvd, tries, rep, jsvd.compute(A,ComputeFullU|ComputeFullV)); - BENCH(t_bdcsvd, tries, rep, bdcsvd.compute(A,ComputeFullU|ComputeFullV)); + BENCH(t_jsvd, tries, rep, jsvd.compute(A,svd_opt)); +// if(size*rows<=20000000) + BENCH(t_bdcsvd, tries, rep, bdcsvd.compute(A,svd_opt)); results["LLT"][id] = t_llt.best(); results["LDLT"][id] = t_ldlt.best(); @@ -52,33 +79,108 @@ void bench(int id, int size = Size) results["ColPivHouseholderQR"][id] = t_cpqr.best(); results["CompleteOrthogonalDecomposition"][id] = t_cod.best(); results["FullPivHouseholderQR"][id] = t_fpqr.best(); - results["JacobiSVD"][id] = size<500 ? t_jsvd.best() : 0; + results["JacobiSVD"][id] = t_jsvd.best(); results["BDCSVD"][id] = t_bdcsvd.best(); } + int main() { + labels.push_back("LLT"); + labels.push_back("LDLT"); + labels.push_back("PartialPivLU"); + labels.push_back("FullPivLU"); + labels.push_back("HouseholderQR"); + labels.push_back("ColPivHouseholderQR"); + labels.push_back("CompleteOrthogonalDecomposition"); + labels.push_back("FullPivHouseholderQR"); + labels.push_back("JacobiSVD"); + labels.push_back("BDCSVD"); + + for(int i=0; i<labels.size(); ++i) + results[labels[i]].fill(-1); + const int small = 8; - const int medium = 100; - const int large = 1000; - const int xl = 4000; - - bench<float,small>(0); - bench<float,Dynamic>(1,medium); - bench<float,Dynamic>(2,large); - bench<float,Dynamic>(3,xl); - - IOFormat fmt(3, 0, " \t", "\n", "", ""); - - std::cout << "solver/size " << small << "\t" << medium << "\t" << large << "\t" << xl << "\n"; - std::cout << "LLT (ms) " << (results["LLT"]/1000.).format(fmt) << "\n"; - std::cout << "LDLT (%) " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n"; - std::cout << "PartialPivLU (%) " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n"; - std::cout << "FullPivLU (%) " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n"; - std::cout << "HouseholderQR (%) " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n"; - std::cout << "ColPivHouseholderQR (%) " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n"; - std::cout << "CompleteOrthogonalDecomposition (%) " << (results["CompleteOrthogonalDecomposition"]/results["LLT"]).format(fmt) << "\n"; - std::cout << "FullPivHouseholderQR (%) " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n"; - std::cout << "JacobiSVD (%) " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n"; - std::cout << "BDCSVD (%) " << (results["BDCSVD"]/results["LLT"]).format(fmt) << "\n"; + sizes.push_back(Array2i(small,small)); + sizes.push_back(Array2i(100,100)); + sizes.push_back(Array2i(1000,1000)); + sizes.push_back(Array2i(4000,4000)); + sizes.push_back(Array2i(10000,small)); + sizes.push_back(Array2i(10000,100)); + sizes.push_back(Array2i(10000,1000)); + sizes.push_back(Array2i(10000,4000)); + + using namespace std; + + for(int k=0; k<sizes.size(); ++k) + { + cout << sizes[k](0) << "x" << sizes[k](1) << "...\n"; + bench<float,Dynamic>(k,sizes[k](0),sizes[k](1)); + } + + cout.width(32); + cout << "solver/size"; + cout << " "; + for(int k=0; k<sizes.size(); ++k) + { + std::stringstream ss; + ss << sizes[k](0) << "x" << sizes[k](1); + cout.width(10); cout << ss.str(); cout << " "; + } + cout << endl; + + + for(int i=0; i<labels.size(); ++i) + { + cout.width(32); cout << labels[i]; cout << " "; + ArrayXf r = (results[labels[i]]*100000.f).floor()/100.f; + for(int k=0; k<sizes.size(); ++k) + { + cout.width(10); + if(r(k)>=1e6) cout << "-"; + else cout << r(k); + cout << " "; + } + cout << endl; + } + + // HTML output + cout << "<table class=\"manual\">" << endl; + cout << "<tr><th>solver/size</th>" << endl; + for(int k=0; k<sizes.size(); ++k) + cout << " <th>" << sizes[k](0) << "x" << sizes[k](1) << "</th>"; + cout << "</tr>" << endl; + for(int i=0; i<labels.size(); ++i) + { + cout << "<tr"; + if(i%2==1) cout << " class=\"alt\""; + cout << "><td>" << labels[i] << "</td>"; + ArrayXf r = (results[labels[i]]*100000.f).floor()/100.f; + for(int k=0; k<sizes.size(); ++k) + { + if(r(k)>=1e6) cout << "<td>-</td>"; + else + { + cout << "<td>" << r(k); + if(i>0) + cout << " (x" << numext::round(10.f*results[labels[i]](k)/results["LLT"](k))/10.f << ")"; + if(i<4 && sizes[k](0)!=sizes[k](1)) + cout << " <sup><a href=\"#note_ls\">*</a></sup>"; + cout << "</td>"; + } + } + cout << "</tr>" << endl; + } + cout << "</table>" << endl; + +// cout << "LLT (ms) " << (results["LLT"]*1000.).format(fmt) << "\n"; +// cout << "LDLT (%) " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n"; +// cout << "PartialPivLU (%) " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n"; +// cout << "FullPivLU (%) " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n"; +// cout << "HouseholderQR (%) " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n"; +// cout << "ColPivHouseholderQR (%) " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n"; +// cout << "CompleteOrthogonalDecomposition (%) " << (results["CompleteOrthogonalDecomposition"]/results["LLT"]).format(fmt) << "\n"; +// cout << "FullPivHouseholderQR (%) " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n"; +// cout << "JacobiSVD (%) " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n"; +// cout << "BDCSVD (%) " << (results["BDCSVD"]/results["LLT"]).format(fmt) << "\n"; } diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt index fb3e48e99..af8eb9b8f 100644 --- a/bench/perf_monitoring/gemm/changesets.txt +++ b/bench/perf_monitoring/gemm/changesets.txt @@ -42,6 +42,20 @@ before-evaluators 6984:45f26866c091 # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache 6986:a675d05b6f8f # blocking heuristic: block on the rhs in L1 if the lhs fit in L1. 7013:f875e75f07e5 # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5) +7015:8aad8f35c955 # Refactor computeProductBlockingSizes to make room for the possibility of using lookup tables +7016:a58d253e8c91 # Polish lookup tables generation +7018:9b27294a8186 # actual_panel_rows computation should always be resilient to parameters not consistent with the known L1 cache size, see comment +7019:c758b1e2c073 # Provide a empirical lookup table for blocking sizes measured on a Nexus 5. Only for float, only for Android on ARM 32bit for now. +7085:627e039fba68 # Bug 986: add support for coefficient-based product with 0 depth. +7098:b6f1db9cf9ec # Bug 992: don't select a 3p GEMM path with non-vectorizable scalar types, this hits unsupported paths in symm/triangular products code 7591:09a8e2186610 # 3.3-alpha1 7650:b0f3c8f43025 # help clang inlining - +#8744:74b789ada92a # Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs) +8789:efcb912e4356 # Made the index type a template parameter to evaluateProductBlockingSizes. Use numext::mini and numext::maxi instead of std::min/std::max to compute blocking sizes +8972:81d53c711775 # Don't optimize the processing of the last rows of a matrix matrix product in cases that violate the assumptions made by the optimized code path +8985:d935df21a082 # Remove the rotating kernel. +8988:6c2dc56e73b3 # Bug 256: enable vectorization with unaligned loads/stores. +9148:b8b8c421e36c # Relax mixing-type constraints for binary coefficient-wise operators +9174:d228bc282ac9 # merge +9212:c90098affa7b # Fix performance regression introduced in changeset 8aad8f35c955 +9213:9f1c14e4694b # Fix performance regression in dgemm introduced by changeset 81d53c711775 diff --git a/bench/perf_monitoring/gemm/lazy_gemm.cpp b/bench/perf_monitoring/gemm/lazy_gemm.cpp index b443218d7..6dc370155 100644 --- a/bench/perf_monitoring/gemm/lazy_gemm.cpp +++ b/bench/perf_monitoring/gemm/lazy_gemm.cpp @@ -12,12 +12,13 @@ using namespace Eigen; typedef SCALAR Scalar; template<typename MatA, typename MatB, typename MatC> -inline void lazy_gemm(const MatA &A, const MatB &B, MatC &C) +EIGEN_DONT_INLINE +void lazy_gemm(const MatA &A, const MatB &B, MatC &C) { - escape((void*)A.data()); - escape((void*)B.data()); +// escape((void*)A.data()); +// escape((void*)B.data()); C.noalias() += A.lazyProduct(B); - escape((void*)C.data()); +// escape((void*)C.data()); } template<int m, int n, int k, int TA> diff --git a/bench/perf_monitoring/gemm/make_plot.sh b/bench/perf_monitoring/gemm/make_plot.sh index 4d6053501..cd3214ac9 100755 --- a/bench/perf_monitoring/gemm/make_plot.sh +++ b/bench/perf_monitoring/gemm/make_plot.sh @@ -25,7 +25,7 @@ echo "set xtics rotate 1" >> $WHAT.gnuplot echo "set term pdf color rounded enhanced fontscale 0.35 size 7in,5in" >> $WHAT.gnuplot echo set output "'"$WHAT.pdf"'" >> $WHAT.gnuplot -col=`cat settings.txt | wc -l` +col=`cat $bench"_settings.txt" | wc -l` echo "plot for [col=2:$col+1] '$WHAT.out.header' using 0:col:xticlabels(1) with lines" >> $WHAT.gnuplot echo " " >> $WHAT.gnuplot diff --git a/bench/perf_monitoring/gemm/run.sh b/bench/perf_monitoring/gemm/run.sh index bfb4ecfac..9d6ee40bc 100755 --- a/bench/perf_monitoring/gemm/run.sh +++ b/bench/perf_monitoring/gemm/run.sh @@ -138,15 +138,15 @@ do done echo "Float:" -cat $PREFIX"s"$bench.out" -echo "" +cat $PREFIX"s""$bench.out" +echo " " echo "Double:" -cat $PREFIX"d"$bench.out" +cat $PREFIX"d""$bench.out" echo "" echo "Complex:" -cat $PREFIX"c"$bench.out" +cat $PREFIX"c""$bench.out" echo "" ./make_plot.sh $PREFIX"s"$bench $bench diff --git a/bench/tensors/README b/bench/tensors/README index 4398aa81b..803cb8ef8 100644 --- a/bench/tensors/README +++ b/bench/tensors/README @@ -1,12 +1,15 @@ -Each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU. +The tensor benchmark suite is made of several parts. + +The first part is a generic suite, in which each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU. To compile the floating point CPU benchmarks, simply call: g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu To compile the floating point GPU benchmarks, simply call: -nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_35 -o benchmarks_gpu - +nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_35 -o benchmarks_gpu -To compile the half float GPU benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code. -nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_53 -o benchmarks_fp16_gpu +We also provide a version of the generic GPU tensor benchmarks that uses half floats (aka fp16) instead of regular floats. To compile these benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code. +nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_53 -o benchmarks_fp16_gpu +last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call +g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu diff --git a/bench/tensors/contraction_benchmarks_cpu.cc b/bench/tensors/contraction_benchmarks_cpu.cc new file mode 100644 index 000000000..f9e57ad47 --- /dev/null +++ b/bench/tensors/contraction_benchmarks_cpu.cc @@ -0,0 +1,39 @@ +#define EIGEN_USE_THREADS + +#include <string> + +#include "tensor_benchmarks.h" + +#define CREATE_THREAD_POOL(threads) \ +Eigen::ThreadPool pool(threads); \ +Eigen::ThreadPoolDevice device(&pool, threads); + + +// Contractions for number of threads ranging from 1 to 32 +// Dimensions are Rows, Cols, Depth +#define BM_ContractionCPU(D1, D2, D3) \ + static void BM_##Contraction##_##D1##x##D2##x##D3(int iters, int Threads) { \ + StopBenchmarkTiming(); \ + CREATE_THREAD_POOL(Threads); \ + BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \ + suite.contraction(iters); \ + } \ + BENCHMARK_RANGE(BM_##Contraction##_##D1##x##D2##x##D3, 1, 32); + + +// Vector Matrix and Matrix Vector products +BM_ContractionCPU(1, 2000, 500); +BM_ContractionCPU(2000, 1, 500); + +// Various skinny matrices +BM_ContractionCPU(250, 3, 512); +BM_ContractionCPU(1500, 3, 512); + +BM_ContractionCPU(512, 800, 4); +BM_ContractionCPU(512, 80, 800); +BM_ContractionCPU(512, 80, 13522); +BM_ContractionCPU(1, 80, 13522); + +BM_ContractionCPU(3200, 512, 4); +BM_ContractionCPU(3200, 512, 80); +BM_ContractionCPU(3200, 80, 512); diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 62533a608..c2fb3dede 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -178,9 +178,14 @@ template <typename Device, typename T> class BenchmarkSuite { size_b[1] = m_; TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b); +#if defined(EIGEN_HAS_INDEX_LIST) + Eigen::IndexPairList<Eigen::type2indexpair<0, 0>, + Eigen::type2indexpair<2, 1> > paddings; +#else Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings; paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0); paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1); +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -368,7 +373,7 @@ template <typename Device, typename T> class BenchmarkSuite { const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B( b_, input_size); Eigen::array<TensorIndex, 0> output_size; - TensorMap<Tensor<float, 0, 0, TensorIndex>, Eigen::Aligned> C( + TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C( c_, output_size); StartBenchmarkTiming(); diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu index 14876556e..65784d0d6 100644 --- a/bench/tensors/tensor_benchmarks_fp16_gpu.cu +++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu @@ -33,6 +33,7 @@ BM_FuncGPU(algebraicFunc); BM_FuncGPU(transcendentalFunc); BM_FuncGPU(rowReduction); BM_FuncGPU(colReduction); +BM_FuncGPU(fullReduction); // Contractions |