From c8d5f21941a41556f94e937ea5a91badb7fb9353 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 28 Jan 2016 16:20:36 -0800 Subject: Added extra tensor benchmarks --- bench/tensors/tensor_benchmarks.h | 87 +++++++++++++++++++++++++++++++++++---- 1 file changed, 80 insertions(+), 7 deletions(-) (limited to 'bench/tensors/tensor_benchmarks.h') diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 071326aa7..6b9d13446 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -45,6 +45,20 @@ template class BenchmarkSuite { finalizeBenchmark(m_ * m_ * num_iters); } + void typeCasting(int num_iters) { + eigen_assert(m_ == n_); + const Eigen::array sizes = {{m_, k_}}; + const TensorMap, Eigen::Aligned> A(a_, sizes); + TensorMap, Eigen::Aligned> B((int*)b_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.cast(); + } + // Record the number of values copied per second + finalizeBenchmark(m_ * k_ * num_iters); + } + void random(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); const Eigen::array sizes = {{m_, m_}}; @@ -87,6 +101,34 @@ template class BenchmarkSuite { finalizeBenchmark(m_ * m_ * num_iters); } + void rowChip(int num_iters) { + const Eigen::array input_size = {{k_, n_}}; + const TensorMap, Eigen::Aligned> B(b_, input_size); + const Eigen::array output_size = {{n_}}; + TensorMap, Eigen::Aligned> C(c_, output_size); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.chip(iter % k_, 0); + } + // Record the number of values copied from the rhs chip to the lhs. + finalizeBenchmark(n_ * num_iters); + } + + void colChip(int num_iters) { + const Eigen::array input_size= {{k_, n_}}; + const TensorMap, Eigen::Aligned> B(b_, input_size); + const Eigen::array output_size = {{n_}}; + TensorMap, Eigen::Aligned> C(c_, output_size); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.chip(iter % n_, 1); + } + // Record the number of values copied from the rhs chip to the lhs. + finalizeBenchmark(n_ * num_iters); + } + void shuffling(int num_iters) { eigen_assert(m_ == n_); const Eigen::array size_a = {{m_, k_}}; @@ -147,7 +189,6 @@ template class BenchmarkSuite { TensorMap, Eigen::Aligned> C(c_, size_c); #ifndef EIGEN_HAS_INDEX_LIST - // nvcc doesn't support cxx11 const Eigen::array broadcast = {{1, n_}}; #else // Take advantage of cxx11 to give the compiler information it can use to @@ -212,14 +253,20 @@ template class BenchmarkSuite { finalizeBenchmark(m_ * m_ * num_iters); } - // Simple reduction - void reduction(int num_iters) { + // Row reduction + void rowReduction(int num_iters) { const Eigen::array input_size = {{k_, n_}}; - const TensorMap, Eigen::Aligned> B(b_, input_size); + const TensorMap, Eigen::Aligned> B(b_, input_size); const Eigen::array output_size = {{n_}}; - TensorMap, Eigen::Aligned> C(c_, output_size); + TensorMap, Eigen::Aligned> C(c_, output_size); - const Eigen::array sum_along_dim = {{0}}; +#ifndef EIGEN_HAS_INDEX_LIST + const Eigen::array sum_along_dim(0); +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList> sum_along_dim; +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -227,7 +274,33 @@ template class BenchmarkSuite { } // Record the number of FLOP executed per second (assuming one operation // per value) - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(k_ * n_ * num_iters); + } + + // Column reduction + void colReduction(int num_iters) { + const Eigen::array input_size = {{k_, n_}}; + const TensorMap, Eigen::Aligned> B( + b_, input_size); + const Eigen::array output_size = {{k_}}; + TensorMap, Eigen::Aligned> C( + c_, output_size); + +#ifndef EIGEN_HAS_INDEX_LIST + const Eigen::array sum_along_dim = {{1}}; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList> sum_along_dim; +#endif + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.sum(sum_along_dim); + } + // Record the number of FLOP executed per second (assuming one operation + // per value) + finalizeBenchmark(k_ * n_ * num_iters); } // do a contraction which is equivalent to a matrix multiplication -- cgit v1.2.3