From c8d5f21941a41556f94e937ea5a91badb7fb9353 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 16:20:36 -0800 Subject: Added extra tensor benchmarks --- bench/tensors/tensor_benchmarks.h | 87 +++++++++++++++++++++++++++++++--- bench/tensors/tensor_benchmarks_cpu.cc | 28 +++++++++-- bench/tensors/tensor_benchmarks_gpu.cu | 7 ++- 3 files changed, 111 insertions(+), 11 deletions(-) (limited to 'bench') diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 071326aa7..6b9d13446 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -45,6 +45,20 @@ template class BenchmarkSuite { finalizeBenchmark(m_ * m_ * num_iters); } + void typeCasting(int num_iters) { + eigen_assert(m_ == n_); + const Eigen::array sizes = {{m_, k_}}; + const TensorMap, Eigen::Aligned> A(a_, sizes); + TensorMap, Eigen::Aligned> B((int*)b_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.cast(); + } + // Record the number of values copied per second + finalizeBenchmark(m_ * k_ * num_iters); + } + void random(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); const Eigen::array sizes = {{m_, m_}}; @@ -87,6 +101,34 @@ template class BenchmarkSuite { finalizeBenchmark(m_ * m_ * num_iters); } + void rowChip(int num_iters) { + const Eigen::array input_size = {{k_, n_}}; + const TensorMap, Eigen::Aligned> B(b_, input_size); + const Eigen::array output_size = {{n_}}; + TensorMap, Eigen::Aligned> C(c_, output_size); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.chip(iter % k_, 0); + } + // Record the number of values copied from the rhs chip to the lhs. + finalizeBenchmark(n_ * num_iters); + } + + void colChip(int num_iters) { + const Eigen::array input_size= {{k_, n_}}; + const TensorMap, Eigen::Aligned> B(b_, input_size); + const Eigen::array output_size = {{n_}}; + TensorMap, Eigen::Aligned> C(c_, output_size); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.chip(iter % n_, 1); + } + // Record the number of values copied from the rhs chip to the lhs. + finalizeBenchmark(n_ * num_iters); + } + void shuffling(int num_iters) { eigen_assert(m_ == n_); const Eigen::array size_a = {{m_, k_}}; @@ -147,7 +189,6 @@ template class BenchmarkSuite { TensorMap, Eigen::Aligned> C(c_, size_c); #ifndef EIGEN_HAS_INDEX_LIST - // nvcc doesn't support cxx11 const Eigen::array broadcast = {{1, n_}}; #else // Take advantage of cxx11 to give the compiler information it can use to @@ -212,14 +253,20 @@ template class BenchmarkSuite { finalizeBenchmark(m_ * m_ * num_iters); } - // Simple reduction - void reduction(int num_iters) { + // Row reduction + void rowReduction(int num_iters) { const Eigen::array input_size = {{k_, n_}}; - const TensorMap, Eigen::Aligned> B(b_, input_size); + const TensorMap, Eigen::Aligned> B(b_, input_size); const Eigen::array output_size = {{n_}}; - TensorMap, Eigen::Aligned> C(c_, output_size); + TensorMap, Eigen::Aligned> C(c_, output_size); - const Eigen::array sum_along_dim = {{0}}; +#ifndef EIGEN_HAS_INDEX_LIST + const Eigen::array sum_along_dim(0); +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList> sum_along_dim; +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -227,7 +274,33 @@ template class BenchmarkSuite { } // Record the number of FLOP executed per second (assuming one operation // per value) - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(k_ * n_ * num_iters); + } + + // Column reduction + void colReduction(int num_iters) { + const Eigen::array input_size = {{k_, n_}}; + const TensorMap, Eigen::Aligned> B( + b_, input_size); + const Eigen::array output_size = {{k_}}; + TensorMap, Eigen::Aligned> C( + c_, output_size); + +#ifndef EIGEN_HAS_INDEX_LIST + const Eigen::array sum_along_dim = {{1}}; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList> sum_along_dim; +#endif + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.sum(sum_along_dim); + } + // Record the number of FLOP executed per second (assuming one operation + // per value) + finalizeBenchmark(k_ * n_ * num_iters); } // do a contraction which is equivalent to a matrix multiplication diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc index 248a63861..6754e1a32 100644 --- a/bench/tensors/tensor_benchmarks_cpu.cc +++ b/bench/tensors/tensor_benchmarks_cpu.cc @@ -22,6 +22,10 @@ BM_FuncCPU(memcpy, 4); BM_FuncCPU(memcpy, 8); BM_FuncCPU(memcpy, 12); +BM_FuncCPU(typeCasting, 4); +BM_FuncCPU(typeCasting, 8); +BM_FuncCPU(typeCasting, 12); + BM_FuncCPU(random, 4); BM_FuncCPU(random, 8); BM_FuncCPU(random, 12); @@ -30,6 +34,14 @@ BM_FuncCPU(slicing, 4); BM_FuncCPU(slicing, 8); BM_FuncCPU(slicing, 12); +BM_FuncCPU(rowChip, 4); +BM_FuncCPU(rowChip, 8); +BM_FuncCPU(rowChip, 12); + +BM_FuncCPU(colChip, 4); +BM_FuncCPU(colChip, 8); +BM_FuncCPU(colChip, 12); + BM_FuncCPU(shuffling, 4); BM_FuncCPU(shuffling, 8); BM_FuncCPU(shuffling, 12); @@ -58,9 +70,13 @@ BM_FuncCPU(transcendentalFunc, 4); BM_FuncCPU(transcendentalFunc, 8); BM_FuncCPU(transcendentalFunc, 12); -BM_FuncCPU(reduction, 4); -BM_FuncCPU(reduction, 8); -BM_FuncCPU(reduction, 12); +BM_FuncCPU(rowReduction, 4); +BM_FuncCPU(rowReduction, 8); +BM_FuncCPU(rowReduction, 12); + +BM_FuncCPU(colReduction, 4); +BM_FuncCPU(colReduction, 8); +BM_FuncCPU(colReduction, 12); // Contractions @@ -98,6 +114,12 @@ BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8); BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12); BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16); + BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1); BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4); BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8); diff --git a/bench/tensors/tensor_benchmarks_gpu.cu b/bench/tensors/tensor_benchmarks_gpu.cu index fbb486efd..fe807d2ab 100644 --- a/bench/tensors/tensor_benchmarks_gpu.cu +++ b/bench/tensors/tensor_benchmarks_gpu.cu @@ -19,6 +19,7 @@ BENCHMARK_RANGE(BM_##FUNC, 10, 5000); BM_FuncGPU(memcpy); +BM_FuncGPU(typeCasting); BM_FuncGPU(random); BM_FuncGPU(slicing); BM_FuncGPU(shuffling); @@ -26,7 +27,10 @@ BM_FuncGPU(padding); BM_FuncGPU(striding); BM_FuncGPU(broadcasting); BM_FuncGPU(coeffWiseOp); -BM_FuncGPU(reduction); +BM_FuncGPU(algebraicFunc); +BM_FuncGPU(transcendentalFunc); +BM_FuncGPU(rowReduction); +BM_FuncGPU(colReduction); // Contractions @@ -45,6 +49,7 @@ BM_FuncGPU(reduction); BM_FuncWithInputDimsGPU(contraction, N, N, N); BM_FuncWithInputDimsGPU(contraction, 64, N, N); BM_FuncWithInputDimsGPU(contraction, N, 64, N); +BM_FuncWithInputDimsGPU(contraction, N, N, 64); // Convolutions -- cgit v1.2.3