diff options
Diffstat (limited to 'bench/tensors')
-rw-r--r-- | bench/tensors/README | 13 | ||||
-rw-r--r-- | bench/tensors/contraction_benchmarks_cpu.cc | 39 | ||||
-rw-r--r-- | bench/tensors/tensor_benchmarks.h | 7 | ||||
-rw-r--r-- | bench/tensors/tensor_benchmarks_fp16_gpu.cu | 1 |
4 files changed, 54 insertions, 6 deletions
diff --git a/bench/tensors/README b/bench/tensors/README index 4398aa81b..803cb8ef8 100644 --- a/bench/tensors/README +++ b/bench/tensors/README @@ -1,12 +1,15 @@ -Each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU. +The tensor benchmark suite is made of several parts. + +The first part is a generic suite, in which each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU. To compile the floating point CPU benchmarks, simply call: g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu To compile the floating point GPU benchmarks, simply call: -nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_35 -o benchmarks_gpu - +nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_35 -o benchmarks_gpu -To compile the half float GPU benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code. -nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_53 -o benchmarks_fp16_gpu +We also provide a version of the generic GPU tensor benchmarks that uses half floats (aka fp16) instead of regular floats. To compile these benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code. +nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_53 -o benchmarks_fp16_gpu +last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call +g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu diff --git a/bench/tensors/contraction_benchmarks_cpu.cc b/bench/tensors/contraction_benchmarks_cpu.cc new file mode 100644 index 000000000..f9e57ad47 --- /dev/null +++ b/bench/tensors/contraction_benchmarks_cpu.cc @@ -0,0 +1,39 @@ +#define EIGEN_USE_THREADS + +#include <string> + +#include "tensor_benchmarks.h" + +#define CREATE_THREAD_POOL(threads) \ +Eigen::ThreadPool pool(threads); \ +Eigen::ThreadPoolDevice device(&pool, threads); + + +// Contractions for number of threads ranging from 1 to 32 +// Dimensions are Rows, Cols, Depth +#define BM_ContractionCPU(D1, D2, D3) \ + static void BM_##Contraction##_##D1##x##D2##x##D3(int iters, int Threads) { \ + StopBenchmarkTiming(); \ + CREATE_THREAD_POOL(Threads); \ + BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \ + suite.contraction(iters); \ + } \ + BENCHMARK_RANGE(BM_##Contraction##_##D1##x##D2##x##D3, 1, 32); + + +// Vector Matrix and Matrix Vector products +BM_ContractionCPU(1, 2000, 500); +BM_ContractionCPU(2000, 1, 500); + +// Various skinny matrices +BM_ContractionCPU(250, 3, 512); +BM_ContractionCPU(1500, 3, 512); + +BM_ContractionCPU(512, 800, 4); +BM_ContractionCPU(512, 80, 800); +BM_ContractionCPU(512, 80, 13522); +BM_ContractionCPU(1, 80, 13522); + +BM_ContractionCPU(3200, 512, 4); +BM_ContractionCPU(3200, 512, 80); +BM_ContractionCPU(3200, 80, 512); diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 62533a608..c2fb3dede 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -178,9 +178,14 @@ template <typename Device, typename T> class BenchmarkSuite { size_b[1] = m_; TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b); +#if defined(EIGEN_HAS_INDEX_LIST) + Eigen::IndexPairList<Eigen::type2indexpair<0, 0>, + Eigen::type2indexpair<2, 1> > paddings; +#else Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings; paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0); paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1); +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -368,7 +373,7 @@ template <typename Device, typename T> class BenchmarkSuite { const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B( b_, input_size); Eigen::array<TensorIndex, 0> output_size; - TensorMap<Tensor<float, 0, 0, TensorIndex>, Eigen::Aligned> C( + TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C( c_, output_size); StartBenchmarkTiming(); diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu index 14876556e..65784d0d6 100644 --- a/bench/tensors/tensor_benchmarks_fp16_gpu.cu +++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu @@ -33,6 +33,7 @@ BM_FuncGPU(algebraicFunc); BM_FuncGPU(transcendentalFunc); BM_FuncGPU(rowReduction); BM_FuncGPU(colReduction); +BM_FuncGPU(fullReduction); // Contractions |