diff options
author | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2016-01-28 16:20:36 -0800 |
---|---|---|
committer | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2016-01-28 16:20:36 -0800 |
commit | c8d5f21941a41556f94e937ea5a91badb7fb9353 (patch) | |
tree | 163d3b29f6ce464803dfa5831aa3e31f91c8c7c8 /bench/tensors | |
parent | 7b3044d086f413fdaf65acd30fc3bc469d43ccc6 (diff) |
Added extra tensor benchmarks
Diffstat (limited to 'bench/tensors')
-rw-r--r-- | bench/tensors/tensor_benchmarks.h | 87 | ||||
-rw-r--r-- | bench/tensors/tensor_benchmarks_cpu.cc | 28 | ||||
-rw-r--r-- | bench/tensors/tensor_benchmarks_gpu.cu | 7 |
3 files changed, 111 insertions, 11 deletions
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 071326aa7..6b9d13446 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -45,6 +45,20 @@ template <typename Device> class BenchmarkSuite { finalizeBenchmark(m_ * m_ * num_iters); } + void typeCasting(int num_iters) { + eigen_assert(m_ == n_); + const Eigen::array<TensorIndex, 2> sizes = {{m_, k_}}; + const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> A(a_, sizes); + TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> B((int*)b_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.cast<int>(); + } + // Record the number of values copied per second + finalizeBenchmark(m_ * k_ * num_iters); + } + void random(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}}; @@ -87,6 +101,34 @@ template <typename Device> class BenchmarkSuite { finalizeBenchmark(m_ * m_ * num_iters); } + void rowChip(int num_iters) { + const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}}; + const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size); + const Eigen::array<TensorIndex, 1> output_size = {{n_}}; + TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.chip(iter % k_, 0); + } + // Record the number of values copied from the rhs chip to the lhs. + finalizeBenchmark(n_ * num_iters); + } + + void colChip(int num_iters) { + const Eigen::array<TensorIndex, 2> input_size= {{k_, n_}}; + const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size); + const Eigen::array<TensorIndex, 1> output_size = {{n_}}; + TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.chip(iter % n_, 1); + } + // Record the number of values copied from the rhs chip to the lhs. + finalizeBenchmark(n_ * num_iters); + } + void shuffling(int num_iters) { eigen_assert(m_ == n_); const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}}; @@ -147,7 +189,6 @@ template <typename Device> class BenchmarkSuite { TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c); #ifndef EIGEN_HAS_INDEX_LIST - // nvcc doesn't support cxx11 const Eigen::array<int, 2> broadcast = {{1, n_}}; #else // Take advantage of cxx11 to give the compiler information it can use to @@ -212,14 +253,20 @@ template <typename Device> class BenchmarkSuite { finalizeBenchmark(m_ * m_ * num_iters); } - // Simple reduction - void reduction(int num_iters) { + // Row reduction + void rowReduction(int num_iters) { const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}}; - const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size); + const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size); const Eigen::array<TensorIndex, 1> output_size = {{n_}}; - TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size); + TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size); - const Eigen::array<TensorIndex, 1> sum_along_dim = {{0}}; +#ifndef EIGEN_HAS_INDEX_LIST + const Eigen::array<TensorIndex, 1> sum_along_dim(0); +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList<Eigen::type2index<0>> sum_along_dim; +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -227,7 +274,33 @@ template <typename Device> class BenchmarkSuite { } // Record the number of FLOP executed per second (assuming one operation // per value) - finalizeBenchmark(m_ * m_ * num_iters); + finalizeBenchmark(k_ * n_ * num_iters); + } + + // Column reduction + void colReduction(int num_iters) { + const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}}; + const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B( + b_, input_size); + const Eigen::array<TensorIndex, 1> output_size = {{k_}}; + TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C( + c_, output_size); + +#ifndef EIGEN_HAS_INDEX_LIST + const Eigen::array<TensorIndex, 1> sum_along_dim = {{1}}; +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList<Eigen::type2index<1>> sum_along_dim; +#endif + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.sum(sum_along_dim); + } + // Record the number of FLOP executed per second (assuming one operation + // per value) + finalizeBenchmark(k_ * n_ * num_iters); } // do a contraction which is equivalent to a matrix multiplication diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc index 248a63861..6754e1a32 100644 --- a/bench/tensors/tensor_benchmarks_cpu.cc +++ b/bench/tensors/tensor_benchmarks_cpu.cc @@ -22,6 +22,10 @@ BM_FuncCPU(memcpy, 4); BM_FuncCPU(memcpy, 8); BM_FuncCPU(memcpy, 12); +BM_FuncCPU(typeCasting, 4); +BM_FuncCPU(typeCasting, 8); +BM_FuncCPU(typeCasting, 12); + BM_FuncCPU(random, 4); BM_FuncCPU(random, 8); BM_FuncCPU(random, 12); @@ -30,6 +34,14 @@ BM_FuncCPU(slicing, 4); BM_FuncCPU(slicing, 8); BM_FuncCPU(slicing, 12); +BM_FuncCPU(rowChip, 4); +BM_FuncCPU(rowChip, 8); +BM_FuncCPU(rowChip, 12); + +BM_FuncCPU(colChip, 4); +BM_FuncCPU(colChip, 8); +BM_FuncCPU(colChip, 12); + BM_FuncCPU(shuffling, 4); BM_FuncCPU(shuffling, 8); BM_FuncCPU(shuffling, 12); @@ -58,9 +70,13 @@ BM_FuncCPU(transcendentalFunc, 4); BM_FuncCPU(transcendentalFunc, 8); BM_FuncCPU(transcendentalFunc, 12); -BM_FuncCPU(reduction, 4); -BM_FuncCPU(reduction, 8); -BM_FuncCPU(reduction, 12); +BM_FuncCPU(rowReduction, 4); +BM_FuncCPU(rowReduction, 8); +BM_FuncCPU(rowReduction, 12); + +BM_FuncCPU(colReduction, 4); +BM_FuncCPU(colReduction, 8); +BM_FuncCPU(colReduction, 12); // Contractions @@ -98,6 +114,12 @@ BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8); BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12); BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12); +BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16); + BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1); BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4); BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8); diff --git a/bench/tensors/tensor_benchmarks_gpu.cu b/bench/tensors/tensor_benchmarks_gpu.cu index fbb486efd..fe807d2ab 100644 --- a/bench/tensors/tensor_benchmarks_gpu.cu +++ b/bench/tensors/tensor_benchmarks_gpu.cu @@ -19,6 +19,7 @@ BENCHMARK_RANGE(BM_##FUNC, 10, 5000); BM_FuncGPU(memcpy); +BM_FuncGPU(typeCasting); BM_FuncGPU(random); BM_FuncGPU(slicing); BM_FuncGPU(shuffling); @@ -26,7 +27,10 @@ BM_FuncGPU(padding); BM_FuncGPU(striding); BM_FuncGPU(broadcasting); BM_FuncGPU(coeffWiseOp); -BM_FuncGPU(reduction); +BM_FuncGPU(algebraicFunc); +BM_FuncGPU(transcendentalFunc); +BM_FuncGPU(rowReduction); +BM_FuncGPU(colReduction); // Contractions @@ -45,6 +49,7 @@ BM_FuncGPU(reduction); BM_FuncWithInputDimsGPU(contraction, N, N, N); BM_FuncWithInputDimsGPU(contraction, 64, N, N); BM_FuncWithInputDimsGPU(contraction, N, 64, N); +BM_FuncWithInputDimsGPU(contraction, N, N, 64); // Convolutions |