tensorflow/core/kernels/reduction_ops_test.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73

#include "tensorflow/core/public/tensor.h"
#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
#include "tensorflow/core/platform/test_benchmark.h"
#include <gtest/gtest.h>

namespace tensorflow {

// Builds a new Graph containing a single "reduce" op (e.g. "Sum") applied to
// a constant, randomly filled float tensor of shape 64 x 64 x (num / 4096).
// The reduction runs over axes 0, 1 and 2, i.e. every dimension of the input.
// Returns the heap-allocated Graph.
static Graph* ToScalar(const string& reduce, int num) {
  // Input: two fixed 64-wide dimensions; the third one absorbs the rest of
  // "num".
  const int depth = num / (64 * 64);
  Tensor input(DT_FLOAT, TensorShape({64, 64, depth}));
  input.flat<float>().setRandom();

  // Reduction indices: each axis of the 3D input, in order.
  Tensor reduce_dims(DT_INT32, TensorShape({3}));
  auto dims = reduce_dims.flat<int32>();
  for (int axis = 0; axis < 3; ++axis) {
    dims(axis) = axis;
  }

  Graph* graph = new Graph(OpRegistry::Global());
  test::graph::Reduce(graph, reduce, test::graph::Constant(graph, input),
                      test::graph::Constant(graph, reduce_dims));
  return graph;
}

// Creates a bench which reduces a 3D tensor with total "num" floats
// into a scalar on a "device". Runs the bench for "iters" times.
static void ReduceToScalar(int iters, const string& device,
                           const string& reduce, int num) {
  testing::ItemsProcessed(static_cast<int64>(iters) * num);
  testing::BytesProcessed(static_cast<int64>(iters) * num * sizeof(float));
  test::Benchmark(device, ToScalar(reduce, num)).Run(iters);
}

// "Sum" reduction to a scalar on the "cpu" device; "num" is the requested
// element count and "iters" the number of benchmark iterations.
static void BM_Sum3DToScalarCPU(int iters, int num) {
  const char* const device = "cpu";
  const char* const reduce_op = "Sum";
  ReduceToScalar(iters, device, reduce_op, num);
}
// Registered with the argument range 1 << 13 .. 1 << 20 for "num".
BENCHMARK(BM_Sum3DToScalarCPU)
    ->Range(1 << 13, 1 << 20);

// "Max" reduction to a scalar on the "cpu" device; "num" is the requested
// element count and "iters" the number of benchmark iterations.
static void BM_Max3DToScalarCPU(int iters, int num) {
  const char* const device = "cpu";
  const char* const reduce_op = "Max";
  ReduceToScalar(iters, device, reduce_op, num);
}
// Registered with the argument range 1 << 13 .. 1 << 20 for "num".
BENCHMARK(BM_Max3DToScalarCPU)
    ->Range(1 << 13, 1 << 20);

// "Prod" reduction to a scalar on the "cpu" device; "num" is the requested
// element count and "iters" the number of benchmark iterations.
static void BM_Prod3DToScalarCPU(int iters, int num) {
  const char* const device = "cpu";
  const char* const reduce_op = "Prod";
  ReduceToScalar(iters, device, reduce_op, num);
}
// Registered with the argument range 1 << 13 .. 1 << 20 for "num".
BENCHMARK(BM_Prod3DToScalarCPU)
    ->Range(1 << 13, 1 << 20);

// "Mean" reduction to a scalar on the "cpu" device; "num" is the requested
// element count and "iters" the number of benchmark iterations.
static void BM_Mean3DToScalarCPU(int iters, int num) {
  const char* const device = "cpu";
  const char* const reduce_op = "Mean";
  ReduceToScalar(iters, device, reduce_op, num);
}
// Registered with the argument range 1 << 13 .. 1 << 20 for "num".
BENCHMARK(BM_Mean3DToScalarCPU)
    ->Range(1 << 13, 1 << 20);

// "Sum" reduction to a scalar on the "gpu" device; "num" is the requested
// element count and "iters" the number of benchmark iterations.
static void BM_Sum3DToScalarGPU(int iters, int num) {
  const char* const device = "gpu";
  const char* const reduce_op = "Sum";
  ReduceToScalar(iters, device, reduce_op, num);
}
// Registered with the argument range 1 << 13 .. 1 << 20 for "num".
BENCHMARK(BM_Sum3DToScalarGPU)
    ->Range(1 << 13, 1 << 20);

// "Max" reduction to a scalar on the "gpu" device; "num" is the requested
// element count and "iters" the number of benchmark iterations.
static void BM_Max3DToScalarGPU(int iters, int num) {
  const char* const device = "gpu";
  const char* const reduce_op = "Max";
  ReduceToScalar(iters, device, reduce_op, num);
}
// Registered with the argument range 1 << 13 .. 1 << 20 for "num".
BENCHMARK(BM_Max3DToScalarGPU)
    ->Range(1 << 13, 1 << 20);

// "Prod" reduction to a scalar on the "gpu" device; "num" is the requested
// element count and "iters" the number of benchmark iterations.
static void BM_Prod3DToScalarGPU(int iters, int num) {
  const char* const device = "gpu";
  const char* const reduce_op = "Prod";
  ReduceToScalar(iters, device, reduce_op, num);
}
// Registered with the argument range 1 << 13 .. 1 << 20 for "num".
BENCHMARK(BM_Prod3DToScalarGPU)
    ->Range(1 << 13, 1 << 20);

// Once Mean is available on GPU, enable this.
// static void BM_Mean3DToScalarGPU(int iters, int num) {
//   ReduceToScalar(iters, "gpu", "Mean", num);
// }
// BENCHMARK(BM_Mean3DToScalarGPU)->Range(1 << 13, 1 << 20);

}  // end namespace tensorflow