1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
|
#include "tensorflow/core/public/tensor.h"
#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
#include "tensorflow/core/platform/test_benchmark.h"
#include <gtest/gtest.h>
namespace tensorflow {
// Builds a new Graph holding a single reduction node named by "reduce"
// (e.g. "Sum", "Max") that collapses a 3D float tensor over axes 0, 1 and 2.
//
// The input tensor has shape {64, 64, num / (64 * 64)} and is filled with
// random values. The last dimension comes from an integer division, so the
// tensor holds exactly "num" elements only when "num" is a multiple of
// 64 * 64.
//
// The Graph is allocated with new and returned as a raw pointer; this
// function never deletes it.
static Graph* ToScalar(const string& reduce, int num) {
  const int kSide = 64;
  const int kNumAxes = 3;
  const int depth = num / (kSide * kSide);

  Graph* graph = new Graph(OpRegistry::Global());

  // Random input values for the reduction.
  Tensor input(DT_FLOAT, TensorShape({kSide, kSide, depth}));
  input.flat<float>().setRandom();

  // Reduce over every dimension: axes = [0, 1, 2].
  Tensor reduce_axes(DT_INT32, TensorShape({kNumAxes}));
  for (int axis = 0; axis < kNumAxes; ++axis) {
    reduce_axes.flat<int32>()(axis) = axis;
  }

  test::graph::Reduce(graph, reduce, test::graph::Constant(graph, input),
                      test::graph::Constant(graph, reduce_axes));
  return graph;
}
// Creates a bench which reduces a 3D tensor with total "num" floats
// into a scalar on a "device". Runs the bench for "iters" times.
static void ReduceToScalar(int iters, const string& device,
const string& reduce, int num) {
testing::ItemsProcessed(static_cast<int64>(iters) * num);
testing::BytesProcessed(static_cast<int64>(iters) * num * sizeof(float));
test::Benchmark(device, ToScalar(reduce, num)).Run(iters);
}
// Benchmark: "Sum" reduction of a 3D tensor to a scalar on "cpu".
// Registered for "num" ranging from 1 << 13 to 1 << 20.
static void BM_Sum3DToScalarCPU(int iters, int num) {
  const char* const device = "cpu";
  const char* const op = "Sum";
  ReduceToScalar(iters, device, op, num);
}
BENCHMARK(BM_Sum3DToScalarCPU)->Range(1 << 13, 1 << 20);
// Benchmark: "Max" reduction of a 3D tensor to a scalar on "cpu".
// Registered for "num" ranging from 1 << 13 to 1 << 20.
static void BM_Max3DToScalarCPU(int iters, int num) {
  const char* const device = "cpu";
  const char* const op = "Max";
  ReduceToScalar(iters, device, op, num);
}
BENCHMARK(BM_Max3DToScalarCPU)->Range(1 << 13, 1 << 20);
// Benchmark: "Prod" reduction of a 3D tensor to a scalar on "cpu".
// Registered for "num" ranging from 1 << 13 to 1 << 20.
static void BM_Prod3DToScalarCPU(int iters, int num) {
  const char* const device = "cpu";
  const char* const op = "Prod";
  ReduceToScalar(iters, device, op, num);
}
BENCHMARK(BM_Prod3DToScalarCPU)->Range(1 << 13, 1 << 20);
// Benchmark: "Mean" reduction of a 3D tensor to a scalar on "cpu".
// Registered for "num" ranging from 1 << 13 to 1 << 20.
static void BM_Mean3DToScalarCPU(int iters, int num) {
  const char* const device = "cpu";
  const char* const op = "Mean";
  ReduceToScalar(iters, device, op, num);
}
BENCHMARK(BM_Mean3DToScalarCPU)->Range(1 << 13, 1 << 20);
// Benchmark: "Sum" reduction of a 3D tensor to a scalar on "gpu".
// Registered for "num" ranging from 1 << 13 to 1 << 20.
static void BM_Sum3DToScalarGPU(int iters, int num) {
  const char* const device = "gpu";
  const char* const op = "Sum";
  ReduceToScalar(iters, device, op, num);
}
BENCHMARK(BM_Sum3DToScalarGPU)->Range(1 << 13, 1 << 20);
// Benchmark: "Max" reduction of a 3D tensor to a scalar on "gpu".
// Registered for "num" ranging from 1 << 13 to 1 << 20.
static void BM_Max3DToScalarGPU(int iters, int num) {
  const char* const device = "gpu";
  const char* const op = "Max";
  ReduceToScalar(iters, device, op, num);
}
BENCHMARK(BM_Max3DToScalarGPU)->Range(1 << 13, 1 << 20);
// Benchmark: "Prod" reduction of a 3D tensor to a scalar on "gpu".
// Registered for "num" ranging from 1 << 13 to 1 << 20.
static void BM_Prod3DToScalarGPU(int iters, int num) {
  const char* const device = "gpu";
  const char* const op = "Prod";
  ReduceToScalar(iters, device, op, num);
}
BENCHMARK(BM_Prod3DToScalarGPU)->Range(1 << 13, 1 << 20);
// Once Mean is available on GPU, enable this.
// static void BM_Mean3DToScalarGPU(int iters, int num) {
// ReduceToScalar(iters, "gpu", "Mean", num);
// }
// BENCHMARK(BM_Mean3DToScalarGPU)->Range(1 << 13, 1 << 20);
} // end namespace tensorflow
|