aboutsummaryrefslogtreecommitdiffhomepage
path: root/bench/tensors
diff options
context:
space:
mode:
authorGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2016-01-28 16:20:36 -0800
committerGravatar Benoit Steiner <benoit.steiner.goog@gmail.com>2016-01-28 16:20:36 -0800
commitc8d5f21941a41556f94e937ea5a91badb7fb9353 (patch)
tree163d3b29f6ce464803dfa5831aa3e31f91c8c7c8 /bench/tensors
parent7b3044d086f413fdaf65acd30fc3bc469d43ccc6 (diff)
Added extra tensor benchmarks
Diffstat (limited to 'bench/tensors')
-rw-r--r--bench/tensors/tensor_benchmarks.h87
-rw-r--r--bench/tensors/tensor_benchmarks_cpu.cc28
-rw-r--r--bench/tensors/tensor_benchmarks_gpu.cu7
3 files changed, 111 insertions, 11 deletions
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
index 071326aa7..6b9d13446 100644
--- a/bench/tensors/tensor_benchmarks.h
+++ b/bench/tensors/tensor_benchmarks.h
@@ -45,6 +45,20 @@ template <typename Device> class BenchmarkSuite {
finalizeBenchmark(m_ * m_ * num_iters);
}
+ void typeCasting(int num_iters) {
+ eigen_assert(m_ == n_);
+ const Eigen::array<TensorIndex, 2> sizes = {{m_, k_}};
+ const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> A(a_, sizes);
+ TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> B((int*)b_, sizes);
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ B.device(device_) = A.cast<int>();
+ }
+ // Record the number of values copied per second
+ finalizeBenchmark(m_ * k_ * num_iters);
+ }
+
void random(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
@@ -87,6 +101,34 @@ template <typename Device> class BenchmarkSuite {
finalizeBenchmark(m_ * m_ * num_iters);
}
+ void rowChip(int num_iters) {
+ const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
+ const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+ const Eigen::array<TensorIndex, 1> output_size = {{n_}};
+ TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ C.device(device_) = B.chip(iter % k_, 0);
+ }
+ // Record the number of values copied from the rhs chip to the lhs.
+ finalizeBenchmark(n_ * num_iters);
+ }
+
+ void colChip(int num_iters) {
+    const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
+ const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+ const Eigen::array<TensorIndex, 1> output_size = {{n_}};
+ TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ C.device(device_) = B.chip(iter % n_, 1);
+ }
+ // Record the number of values copied from the rhs chip to the lhs.
+ finalizeBenchmark(n_ * num_iters);
+ }
+
void shuffling(int num_iters) {
eigen_assert(m_ == n_);
const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}};
@@ -147,7 +189,6 @@ template <typename Device> class BenchmarkSuite {
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c);
#ifndef EIGEN_HAS_INDEX_LIST
- // nvcc doesn't support cxx11
const Eigen::array<int, 2> broadcast = {{1, n_}};
#else
// Take advantage of cxx11 to give the compiler information it can use to
@@ -212,14 +253,20 @@ template <typename Device> class BenchmarkSuite {
finalizeBenchmark(m_ * m_ * num_iters);
}
- // Simple reduction
- void reduction(int num_iters) {
+ // Row reduction
+ void rowReduction(int num_iters) {
const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
- const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size);
+ const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
const Eigen::array<TensorIndex, 1> output_size = {{n_}};
- TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size);
+ TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
- const Eigen::array<TensorIndex, 1> sum_along_dim = {{0}};
+#ifndef EIGEN_HAS_INDEX_LIST
+  const Eigen::array<TensorIndex, 1> sum_along_dim = {{0}};
+#else
+ // Take advantage of cxx11 to give the compiler information it can use to
+ // optimize the code.
+ Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
+#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
@@ -227,7 +274,33 @@ template <typename Device> class BenchmarkSuite {
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
- finalizeBenchmark(m_ * m_ * num_iters);
+ finalizeBenchmark(k_ * n_ * num_iters);
+ }
+
+ // Column reduction
+ void colReduction(int num_iters) {
+ const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
+ const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(
+ b_, input_size);
+ const Eigen::array<TensorIndex, 1> output_size = {{k_}};
+ TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(
+ c_, output_size);
+
+#ifndef EIGEN_HAS_INDEX_LIST
+ const Eigen::array<TensorIndex, 1> sum_along_dim = {{1}};
+#else
+ // Take advantage of cxx11 to give the compiler information it can use to
+ // optimize the code.
+ Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
+#endif
+
+ StartBenchmarkTiming();
+ for (int iter = 0; iter < num_iters; ++iter) {
+ C.device(device_) = B.sum(sum_along_dim);
+ }
+ // Record the number of FLOP executed per second (assuming one operation
+ // per value)
+ finalizeBenchmark(k_ * n_ * num_iters);
}
// do a contraction which is equivalent to a matrix multiplication
diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc
index 248a63861..6754e1a32 100644
--- a/bench/tensors/tensor_benchmarks_cpu.cc
+++ b/bench/tensors/tensor_benchmarks_cpu.cc
@@ -22,6 +22,10 @@ BM_FuncCPU(memcpy, 4);
BM_FuncCPU(memcpy, 8);
BM_FuncCPU(memcpy, 12);
+BM_FuncCPU(typeCasting, 4);
+BM_FuncCPU(typeCasting, 8);
+BM_FuncCPU(typeCasting, 12);
+
BM_FuncCPU(random, 4);
BM_FuncCPU(random, 8);
BM_FuncCPU(random, 12);
@@ -30,6 +34,14 @@ BM_FuncCPU(slicing, 4);
BM_FuncCPU(slicing, 8);
BM_FuncCPU(slicing, 12);
+BM_FuncCPU(rowChip, 4);
+BM_FuncCPU(rowChip, 8);
+BM_FuncCPU(rowChip, 12);
+
+BM_FuncCPU(colChip, 4);
+BM_FuncCPU(colChip, 8);
+BM_FuncCPU(colChip, 12);
+
BM_FuncCPU(shuffling, 4);
BM_FuncCPU(shuffling, 8);
BM_FuncCPU(shuffling, 12);
@@ -58,9 +70,13 @@ BM_FuncCPU(transcendentalFunc, 4);
BM_FuncCPU(transcendentalFunc, 8);
BM_FuncCPU(transcendentalFunc, 12);
-BM_FuncCPU(reduction, 4);
-BM_FuncCPU(reduction, 8);
-BM_FuncCPU(reduction, 12);
+BM_FuncCPU(rowReduction, 4);
+BM_FuncCPU(rowReduction, 8);
+BM_FuncCPU(rowReduction, 12);
+
+BM_FuncCPU(colReduction, 4);
+BM_FuncCPU(colReduction, 8);
+BM_FuncCPU(colReduction, 12);
// Contractions
@@ -98,6 +114,12 @@ BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16);
+
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8);
diff --git a/bench/tensors/tensor_benchmarks_gpu.cu b/bench/tensors/tensor_benchmarks_gpu.cu
index fbb486efd..fe807d2ab 100644
--- a/bench/tensors/tensor_benchmarks_gpu.cu
+++ b/bench/tensors/tensor_benchmarks_gpu.cu
@@ -19,6 +19,7 @@
BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
BM_FuncGPU(memcpy);
+BM_FuncGPU(typeCasting);
BM_FuncGPU(random);
BM_FuncGPU(slicing);
BM_FuncGPU(shuffling);
@@ -26,7 +27,10 @@ BM_FuncGPU(padding);
BM_FuncGPU(striding);
BM_FuncGPU(broadcasting);
BM_FuncGPU(coeffWiseOp);
-BM_FuncGPU(reduction);
+BM_FuncGPU(algebraicFunc);
+BM_FuncGPU(transcendentalFunc);
+BM_FuncGPU(rowReduction);
+BM_FuncGPU(colReduction);
// Contractions
@@ -45,6 +49,7 @@ BM_FuncGPU(reduction);
BM_FuncWithInputDimsGPU(contraction, N, N, N);
BM_FuncWithInputDimsGPU(contraction, 64, N, N);
BM_FuncWithInputDimsGPU(contraction, N, 64, N);
+BM_FuncWithInputDimsGPU(contraction, N, N, 64);
// Convolutions