about summary refs log tree commit diff homepage
path: root/bench/tensors
diff options
context:
space:
mode:
author Yangqing Jia <me@daggerfs.com> 2016-01-28 10:35:14 -0800
committer Yangqing Jia <me@daggerfs.com> 2016-01-28 10:35:14 -0800
commit c4e47630b16a716d01dc20b36afa8882b03681a1 (patch)
tree fe2cd8765e7264da3a48712fcbbeddd0733780ef /bench/tensors
parent 4865e1e73265e12d564f8b4d9069a2159f777d90 (diff)
benchmark modifications to make it compilable in a standalone fashion.
Diffstat (limited to 'bench/tensors')
-rw-r--r-- bench/tensors/benchmark.h 48
-rw-r--r-- bench/tensors/benchmark_main.cc 215
-rw-r--r-- bench/tensors/tensor_benchmarks.h 87
-rw-r--r-- bench/tensors/tensor_benchmarks_cpu.cc 20
-rw-r--r-- bench/tensors/tensor_benchmarks_gpu.cc 4
5 files changed, 318 insertions, 56 deletions
diff --git a/bench/tensors/benchmark.h b/bench/tensors/benchmark.h
new file mode 100644
index 000000000..d8b4fd4c6
--- /dev/null
+++ b/bench/tensors/benchmark.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <stdint.h>
+#include <vector>
+
+namespace testing {
+class Benchmark {  // One registered benchmark: a name, a function to time, and optional integer arguments.
+ public:
+ Benchmark(const char* name, void (*fn)(int)) {  // fn is invoked as fn(iterations).
+ Register(name, fn, NULL);
+ }
+ Benchmark(const char* name, void (*fn_range)(int, int)) {  // fn_range is invoked as fn_range(iterations, arg).
+ Register(name, NULL, fn_range);
+ }
+ Benchmark* Arg(int x);  // Appends x to the argument list; returns this for chaining.
+ Benchmark* Range(int lo, int hi);  // Appends lo, lo*8, lo*64, ... (terms below hi) and then hi; returns this.
+ const char* Name();  // Name passed at construction.
+ bool ShouldRun(int argc, char* argv[]);  // True if argc == 1 or some argv[i] (i >= 1) matches the name as a regex.
+ void Run();  // Runs once with arg 0 for fn benchmarks, or once per stored argument for fn_range benchmarks.
+ private:
+ const char* name_;  // Pointer stored as given; the string is not copied.
+ void (*fn_)(int);  // Set for single-argument benchmarks, otherwise NULL.
+ void (*fn_range_)(int, int);  // Set for ranged benchmarks, otherwise NULL.
+ std::vector<int> args_;  // Argument values collected by Arg() and Range().
+ void Register(const char* name, void (*fn)(int), void (*fn_range)(int, int));  // Stores the fields and adds this object to the global benchmark map.
+ void RunRepeatedlyWithArg(int iterations, int arg);  // One timed invocation of the benchmark function.
+ void RunWithArg(int arg);  // Repeats with growing iteration counts, then prints one result line.
+};
+} // namespace testing
+void SetBenchmarkBytesProcessed(int64_t);
+void StopBenchmarkTiming();
+void StartBenchmarkTiming();
+#define BENCHMARK(f) \
+ static ::testing::Benchmark* _benchmark_##f __attribute__((unused)) = \
+ (new ::testing::Benchmark(#f, f))
\ No newline at end of file
diff --git a/bench/tensors/benchmark_main.cc b/bench/tensors/benchmark_main.cc
new file mode 100644
index 000000000..0fc12960e
--- /dev/null
+++ b/bench/tensors/benchmark_main.cc
@@ -0,0 +1,215 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "benchmark.h"
+
+#include <inttypes.h>
+#include <limits.h>
+#include <regex.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include <algorithm>
+#include <map>
+#include <string>
+#include <utility>
+
+static int64_t g_bytes_processed;  // Value last passed to SetBenchmarkBytesProcessed(); reset to 0 before each timed run.
+static int64_t g_benchmark_total_time_ns;  // Nanoseconds accumulated while the timer was running during the current run.
+static int64_t g_benchmark_start_time_ns;  // NanoTime() when the timer was last started; 0 means the timer is stopped.
+typedef std::map<std::string, ::testing::Benchmark*> BenchmarkMap;  // Benchmark name -> benchmark object.
+typedef BenchmarkMap::iterator BenchmarkMapIt;
+static BenchmarkMap g_benchmarks;  // Filled by Benchmark::Register(); iterated in name order by main().
+static int g_name_column_width = 20;  // Minimum width of the name column; main() widens it to the longest name.
+// Rounds n up to a "round" iteration count. With base being the largest power
+// of ten strictly below n (or 1 when n <= 1), returns 2*base if n < 2*base,
+// else 5*base if n < 5*base, else 10*base.
+//
+// The arithmetic is done in 64 bits and the result is clamped to INT_MAX:
+// with plain int, any n above 10^9 overflowed while evaluating base*10
+// (undefined behaviour), and 5*base / 10*base could overflow as well.
+static int Round(int n) {
+  const int64_t target = n;
+  int64_t base = 1;
+  while (base * 10 < target) {
+    base *= 10;
+  }
+  int64_t rounded;
+  if (target < 2 * base) {
+    rounded = 2 * base;
+  } else if (target < 5 * base) {
+    rounded = 5 * base;
+  } else {
+    rounded = 10 * base;
+  }
+  return rounded > INT_MAX ? INT_MAX : static_cast<int>(rounded);
+}
+// Returns the current CLOCK_MONOTONIC reading expressed in nanoseconds.
+static int64_t NanoTime() {
+  struct timespec now;
+  // Zero the fields first so a failed clock_gettime() yields 0 rather than garbage.
+  now.tv_nsec = 0;
+  now.tv_sec = 0;
+  clock_gettime(CLOCK_MONOTONIC, &now);
+  const int64_t whole_seconds = static_cast<int64_t>(now.tv_sec);
+  return whole_seconds * 1000000000LL + now.tv_nsec;
+}
+namespace testing {
+// Adds a single explicit argument value to the list this benchmark is run
+// with. Returns this so registration calls can be chained.
+Benchmark* Benchmark::Arg(int arg) {
+  Benchmark* const self = this;
+  self->args_.push_back(arg);
+  return self;
+}
+
+// Adds a geometric series of arguments covering [lo, hi]: lo, lo*8, lo*64, ...
+// for every term below hi, followed by hi itself. The bounds are swapped when
+// given in the wrong order. Returns this so registration calls can be chained.
+//
+// A non-positive lo is emitted once and the series then continues from 1;
+// previously such a lo never reached hi and the loop did not terminate.
+Benchmark* Benchmark::Range(int lo, int hi) {
+  const int kRangeMultiplier = 8;
+  if (hi < lo) {
+    const int temp = hi;
+    hi = lo;
+    lo = temp;
+  }
+  int current = lo;
+  while (current < hi) {
+    args_.push_back(current);
+    if (current <= 0) {
+      // Multiplying a non-positive value never approaches hi (0*8 == 0).
+      current = 1;
+    } else if (current > hi / kRangeMultiplier) {
+      // The next term would exceed hi (and might overflow int): the series is done.
+      break;
+    } else {
+      current *= kRangeMultiplier;
+    }
+  }
+  // We always run the hi number.
+  args_.push_back(hi);
+  return this;
+}
+
+// Returns the name pointer that was supplied when the benchmark was registered.
+const char* Benchmark::Name() {
+  const char* const registered_name = name_;
+  return registered_name;
+}
+// Decides whether this benchmark was selected on the command line.
+//
+// With no arguments (argc == 1) every benchmark runs. Otherwise each argv[i],
+// i >= 1, is compiled as a POSIX basic regular expression and the benchmark
+// runs if any of them matches its name. An argument that does not compile is
+// reported on stderr and terminates the process.
+bool Benchmark::ShouldRun(int argc, char* argv[]) {
+  if (argc == 1) {
+    return true;  // With no arguments, we run all benchmarks.
+  }
+  for (int i = 1; i < argc; i++) {
+    regex_t re;
+    if (regcomp(&re, argv[i], 0) != 0) {
+      fprintf(stderr, "couldn't compile \"%s\" as a regular expression!\n", argv[i]);
+      exit(EXIT_FAILURE);
+    }
+    const int status = regexec(&re, name_, 0, NULL, 0);
+    regfree(&re);
+    // regexec() returns 0 only on a match; REG_NOMATCH and any other non-zero
+    // code (an execution error) must not be treated as a match.
+    if (status == 0) {
+      return true;
+    }
+  }
+  return false;
+}
+// Stores the benchmark's name and function pointers and adds the object to the
+// global name -> benchmark map. Exactly one of fn / fn_range is expected to be
+// non-NULL; if both are NULL the process exits with an error.
+//
+// A name that is already registered is reported on stderr instead of being
+// dropped silently (std::map::insert keeps the first entry).
+//
+// NOTE(review): this runs from the Benchmark constructors, which the BENCHMARK
+// macro invokes during static initialization of other translation units, while
+// g_benchmarks is a namespace-scope static of this file. Their relative
+// construction order is not guaranteed; consider a function-local static.
+void Benchmark::Register(const char* name, void (*fn)(int), void (*fn_range)(int, int)) {
+  name_ = name;
+  fn_ = fn;
+  fn_range_ = fn_range;
+  if (fn_ == NULL && fn_range_ == NULL) {
+    fprintf(stderr, "%s: missing function\n", name_);
+    exit(EXIT_FAILURE);
+  }
+  const bool inserted = g_benchmarks.insert(std::make_pair(name, this)).second;
+  if (!inserted) {
+    fprintf(stderr, "%s: duplicate benchmark name, later registration ignored\n", name_);
+  }
+}
+// Executes the benchmark: a plain fn benchmark runs once with argument 0, a
+// ranged benchmark runs once per registered argument. A ranged benchmark with
+// no arguments is an error and terminates the process.
+void Benchmark::Run() {
+  if (fn_ != NULL) {
+    RunWithArg(0);
+    return;
+  }
+  if (args_.empty()) {
+    fprintf(stderr, "%s: no args!\n", name_);
+    exit(EXIT_FAILURE);
+  }
+  for (std::vector<int>::const_iterator it = args_.begin(); it != args_.end(); ++it) {
+    RunWithArg(*it);
+  }
+}
+// Performs one timed invocation of the benchmark function, asking it to do
+// `iterations` iterations. The byte counter and accumulated time are reset
+// first, and the timer is started before the call.
+void Benchmark::RunRepeatedlyWithArg(int iterations, int arg) {
+  g_benchmark_total_time_ns = 0;
+  g_bytes_processed = 0;
+  g_benchmark_start_time_ns = NanoTime();
+  if (fn_ == NULL) {
+    fn_range_(iterations, arg);
+  } else {
+    fn_(iterations);
+  }
+  // A start time of 0 means the body already stopped the timer itself.
+  if (g_benchmark_start_time_ns == 0) {
+    return;
+  }
+  g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns;
+}
+// Runs the benchmark for one argument value and prints a result line.
+//
+// The benchmark is first run for a single iteration, then re-run with a
+// growing iteration count until either about one second of timed work has been
+// measured or the count reaches 10^9. The printed line holds the name (with a
+// "/<arg>" suffix for ranged benchmarks), the final iteration count, ns per
+// iteration and, when bytes were reported, the throughput.
+//
+// The next iteration count is computed in 64 bits and capped at 10^9 before
+// rounding: in plain int, 100*last overflowed once last exceeded ~21 million.
+// Throughput divides by 2^20 so the value matches its "MiB/s" label.
+void Benchmark::RunWithArg(int arg) {
+  const int64_t kTargetTimeNs = 1000000000LL;
+  const int64_t kMaxIterations = 1000000000LL;
+  // run once in case it's expensive
+  int iterations = 1;
+  RunRepeatedlyWithArg(iterations, arg);
+  while (g_benchmark_total_time_ns < kTargetTimeNs && iterations < kMaxIterations) {
+    const int64_t last = iterations;
+    const int64_t ns_per_iteration = g_benchmark_total_time_ns / last;
+    // Estimate how many iterations fit in the target time.
+    int64_t next = (ns_per_iteration == 0) ? kMaxIterations
+                                           : kMaxIterations / ns_per_iteration;
+    // Overshoot by 50%, but grow by at least one and at most 100x per step.
+    next = std::max(last + 1, std::min(next + next / 2, 100 * last));
+    next = std::min(next, kMaxIterations);
+    iterations = Round(static_cast<int>(next));
+    RunRepeatedlyWithArg(iterations, arg);
+  }
+  char throughput[100];
+  throughput[0] = '\0';
+  if (g_benchmark_total_time_ns > 0 && g_bytes_processed > 0) {
+    const double mib_processed = static_cast<double>(g_bytes_processed) / (1024.0 * 1024.0);
+    const double seconds = static_cast<double>(g_benchmark_total_time_ns) / 1e9;
+    snprintf(throughput, sizeof(throughput), " %8.2f MiB/s", mib_processed / seconds);
+  }
+  char full_name[100];
+  if (fn_range_ != NULL) {
+    // Abbreviate large arguments with binary K/M suffixes.
+    if (arg >= (1<<20)) {
+      snprintf(full_name, sizeof(full_name), "%s/%dM", name_, arg/(1<<20));
+    } else if (arg >= (1<<10)) {
+      snprintf(full_name, sizeof(full_name), "%s/%dK", name_, arg/(1<<10));
+    } else {
+      snprintf(full_name, sizeof(full_name), "%s/%d", name_, arg);
+    }
+  } else {
+    snprintf(full_name, sizeof(full_name), "%s", name_);
+  }
+  printf("%-*s %10d %10" PRId64 "%s\n", g_name_column_width, full_name,
+         iterations, g_benchmark_total_time_ns/iterations, throughput);
+  fflush(stdout);
+}
+} // namespace testing
+void SetBenchmarkBytesProcessed(int64_t x) {  // Records the byte count used for the throughput column of the current run.
+ g_bytes_processed = x;
+}
+// Pauses the benchmark timer: if it is running, the time elapsed since it was
+// started is added to the total. The timer is then marked as stopped.
+void StopBenchmarkTiming() {
+  const bool timer_running = (g_benchmark_start_time_ns != 0);
+  if (timer_running) {
+    const int64_t elapsed_ns = NanoTime() - g_benchmark_start_time_ns;
+    g_benchmark_total_time_ns += elapsed_ns;
+  }
+  g_benchmark_start_time_ns = 0;
+}
+// Resumes the benchmark timer. Has no effect if the timer is already running.
+void StartBenchmarkTiming() {
+  if (g_benchmark_start_time_ns != 0) {
+    return;
+  }
+  g_benchmark_start_time_ns = NanoTime();
+}
+// Entry point: runs every registered benchmark selected by the command-line
+// arguments (see Benchmark::ShouldRun) and prints one result line for each.
+// Exits with failure if nothing is registered or nothing matches; in the
+// latter case the available benchmark names are listed on stderr.
+int main(int argc, char* argv[]) {
+  if (g_benchmarks.empty()) {
+    fprintf(stderr, "No benchmarks registered!\n");
+    exit(EXIT_FAILURE);
+  }
+  // Widen the name column so the longest benchmark name fits.
+  for (BenchmarkMapIt entry = g_benchmarks.begin(); entry != g_benchmarks.end(); ++entry) {
+    const int width = static_cast<int>(strlen(entry->second->Name()));
+    if (width > g_name_column_width) {
+      g_name_column_width = width;
+    }
+  }
+  // The header is printed lazily, just before the first selected benchmark.
+  bool header_printed = false;
+  for (BenchmarkMapIt entry = g_benchmarks.begin(); entry != g_benchmarks.end(); ++entry) {
+    ::testing::Benchmark* bench = entry->second;
+    if (!bench->ShouldRun(argc, argv)) {
+      continue;
+    }
+    if (!header_printed) {
+      printf("%-*s %10s %10s\n", g_name_column_width, "", "iterations", "ns/op");
+      fflush(stdout);
+      header_printed = true;
+    }
+    bench->Run();
+  }
+  if (header_printed) {
+    return 0;
+  }
+  fprintf(stderr, "No matching benchmarks!\n");
+  fprintf(stderr, "Available benchmarks:\n");
+  for (BenchmarkMapIt entry = g_benchmarks.begin(); entry != g_benchmarks.end(); ++entry) {
+    fprintf(stderr, " %s\n", entry->second->Name());
+  }
+  exit(EXIT_FAILURE);
+}
\ No newline at end of file
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h
index 525b9acda..a1696afda 100644
--- a/bench/tensors/tensor_benchmarks.h
+++ b/bench/tensors/tensor_benchmarks.h
@@ -4,12 +4,23 @@
typedef int TensorIndex;
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "testing/base/public/benchmark.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+#include "benchmark.h"
+
+#define BENCHMARK_RANGE(bench, lo, hi) \
+ BENCHMARK(bench)->Range(lo, hi)
+
+// Concatenates the stream (operator<<) representation of every argument, in
+// order, into one std::string. Called with no arguments it returns "".
+// Self-contained: it no longer depends on a StrCatRecursive helper, which is
+// not declared in this header.
+template <typename... Args>
+std::string StrCat(const Args... args) {
+  std::stringstream ss;
+  // Expanding the pack inside a braced initializer evaluates the insertions
+  // left to right; the leading 0 keeps the array non-empty for an empty pack.
+  const int expand[] = {0, (static_cast<void>(ss << args), 0)...};
+  static_cast<void>(expand);
+  return ss.str();
+}
using Eigen::Tensor;
using Eigen::TensorMap;
+typedef int64_t int64;
// TODO(bsteiner): also templatize on the input type since we have users
// for int8 as well as floats.
@@ -43,7 +54,7 @@ template <typename Device> class BenchmarkSuite {
void random(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
- const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+ const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
StartBenchmarkTiming();
@@ -56,16 +67,16 @@ template <typename Device> class BenchmarkSuite {
void slicing(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
- const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+ const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
- const Eigen::DSizes<TensorIndex, 2> quarter_sizes(Eigen::array<TensorIndex, 2>(m_/2, m_/2));
- const Eigen::DSizes<TensorIndex, 2> first_quadrant(Eigen::array<TensorIndex, 2>(0, 0));
- const Eigen::DSizes<TensorIndex, 2> second_quadrant(Eigen::array<TensorIndex, 2>(0, m_/2));
- const Eigen::DSizes<TensorIndex, 2> third_quadrant(Eigen::array<TensorIndex, 2>(m_/2, 0));
- const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(Eigen::array<TensorIndex, 2>(m_/2, m_/2));
+ const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
+ const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
+ const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
+ const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
+ const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
@@ -85,12 +96,12 @@ template <typename Device> class BenchmarkSuite {
void shuffling(int num_iters) {
eigen_assert(m_ == n_);
- const Eigen::array<TensorIndex, 2> size_a(m_, k_);
+ const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
- const Eigen::array<TensorIndex, 2> size_b(k_, m_);
+ const Eigen::array<TensorIndex, 2> size_b = {{k_, m_}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
- const Eigen::array<int, 2> shuffle(1, 0);
+ const Eigen::array<int, 2> shuffle = {{1, 0}};
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
@@ -102,9 +113,9 @@ template <typename Device> class BenchmarkSuite {
void padding(int num_iters) {
eigen_assert(m_ == k_);
- const Eigen::array<TensorIndex, 2> size_a(m_, k_-3);
+ const Eigen::array<TensorIndex, 2> size_a = {{m_, k_-3}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
- const Eigen::array<TensorIndex, 2> size_b(k_, m_);
+ const Eigen::array<TensorIndex, 2> size_b = {{k_, m_}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
@@ -121,12 +132,12 @@ template <typename Device> class BenchmarkSuite {
void striding(int num_iters) {
eigen_assert(m_ == k_);
- const Eigen::array<TensorIndex, 2> size_a(m_, k_);
+ const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
- const Eigen::array<TensorIndex, 2> size_b(m_, k_ / 2);
+ const Eigen::array<TensorIndex, 2> size_b = {{m_, k_ / 2}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
- const Eigen::array<TensorIndex, 2> strides(1, 2);
+ const Eigen::array<TensorIndex, 2> strides = {{1, 2}};
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
@@ -137,14 +148,14 @@ template <typename Device> class BenchmarkSuite {
}
void broadcasting(int num_iters) {
- const Eigen::array<TensorIndex, 2> size_a(m_, 1);
+ const Eigen::array<TensorIndex, 2> size_a = {{m_, 1}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
- const Eigen::array<TensorIndex, 2> size_c(m_, n_);
+ const Eigen::array<TensorIndex, 2> size_c = {{m_, n_}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c);
-#if defined(__CUDACC__)
+#ifndef EIGEN_HAS_INDEX_LIST
// nvcc doesn't support cxx11
- const Eigen::array<int, 2> broadcast(1, n_);
+ const Eigen::array<int, 2> broadcast = {{1, n_}};
#else
// Take advantage of cxx11 to give the compiler information it can use to
// optimize the code.
@@ -162,7 +173,7 @@ template <typename Device> class BenchmarkSuite {
void coeffWiseOp(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
- const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+ const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
@@ -178,7 +189,7 @@ template <typename Device> class BenchmarkSuite {
void algebraicFunc(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
- const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+ const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
@@ -194,7 +205,7 @@ template <typename Device> class BenchmarkSuite {
void transcendentalFunc(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
- const Eigen::array<TensorIndex, 2> sizes(m_, m_);
+ const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
@@ -210,12 +221,12 @@ template <typename Device> class BenchmarkSuite {
// Simple reduction
void reduction(int num_iters) {
- const Eigen::array<TensorIndex, 2> input_size(k_, n_);
+ const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size);
- const Eigen::array<TensorIndex, 1> output_size(n_);
+ const Eigen::array<TensorIndex, 1> output_size = {{n_}};
TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size);
- const Eigen::array<TensorIndex, 1> sum_along_dim(0);
+ const Eigen::array<TensorIndex, 1> sum_along_dim = {{0}};
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
@@ -228,16 +239,16 @@ template <typename Device> class BenchmarkSuite {
// do a contraction which is equivalent to a matrix multiplication
void contraction(int num_iters) {
- const Eigen::array<TensorIndex, 2> sizeA(m_, k_);
- const Eigen::array<TensorIndex, 2> sizeB(k_, n_);
- const Eigen::array<TensorIndex, 2> sizeC(m_, n_);
+ const Eigen::array<TensorIndex, 2> sizeA = {{m_, k_}};
+ const Eigen::array<TensorIndex, 2> sizeB = {{k_, n_}};
+ const Eigen::array<TensorIndex, 2> sizeC = {{m_, n_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC);
typedef typename Tensor<float, 2>::DimensionPair DimPair;
- const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+ const Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
@@ -249,14 +260,14 @@ template <typename Device> class BenchmarkSuite {
}
void convolution(int num_iters, int kernel_x, int kernel_y) {
- const Eigen::array<TensorIndex, 2> input_sizes(m_, n_);
+ const Eigen::array<TensorIndex, 2> input_sizes = {{m_, n_}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes);
- const Eigen::array<TensorIndex, 2> kernel_sizes(kernel_x, kernel_y);
+ const Eigen::array<TensorIndex, 2> kernel_sizes = {{kernel_x, kernel_y}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes);
- const Eigen::array<TensorIndex, 2> result_sizes(
- m_ - kernel_x + 1, n_ - kernel_y + 1);
+ const Eigen::array<TensorIndex, 2> result_sizes =
+ {{m_ - kernel_x + 1, n_ - kernel_y + 1}};
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes);
- Eigen::array<Tensor<float, 2>::Index, 2> dims(0, 1);
+ Eigen::array<Tensor<float, 2>::Index, 2> dims = {{0, 1}};
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
@@ -280,7 +291,7 @@ template <typename Device> class BenchmarkSuite {
device_.memset(b_, 23, k_ * n_ * sizeof(float));
device_.memset(c_, 31, m_ * n_ * sizeof(float));
- BenchmarkUseRealTime();
+ //BenchmarkUseRealTime();
}
inline void finalizeBenchmark(int64 num_items) {
@@ -290,7 +301,7 @@ template <typename Device> class BenchmarkSuite {
}
#endif
StopBenchmarkTiming();
- SetBenchmarkItemsProcessed(num_items);
+ SetBenchmarkBytesProcessed(num_items);
}
diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc
index 68653ba15..248a63861 100644
--- a/bench/tensors/tensor_benchmarks_cpu.cc
+++ b/bench/tensors/tensor_benchmarks_cpu.cc
@@ -1,19 +1,12 @@
#define EIGEN_USE_THREADS
-#include "base/sysinfo.h"
-#include "strings/strcat.h"
-#include "third_party/eigen3/tensor_benchmarks.h"
-#include "thread/threadpool.h"
+#include <string>
+
+#include "tensor_benchmarks.h"
-#ifdef __ANDROID__
-#define CREATE_THREAD_POOL(threads) \
-Eigen::ThreadPoolDevice device(threads);
-#else
#define CREATE_THREAD_POOL(threads) \
-ThreadPool tp(threads); \
-tp.StartWorkers(); \
-Eigen::ThreadPoolDevice device(&tp, threads);
-#endif
+Eigen::ThreadPool pool(threads); \
+Eigen::ThreadPoolDevice device(&pool, threads);
// Simple functions
#define BM_FuncCPU(FUNC, THREADS) \
@@ -22,7 +15,6 @@ Eigen::ThreadPoolDevice device(&tp, threads);
CREATE_THREAD_POOL(THREADS); \
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
suite.FUNC(iters); \
- SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
@@ -84,7 +76,6 @@ BM_FuncCPU(reduction, 12);
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, D1, D2, D3); \
suite.FUNC(iters); \
} \
- SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000);
@@ -127,7 +118,6 @@ BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16);
CREATE_THREAD_POOL(THREADS); \
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
suite.FUNC(iters, DIM1, DIM2); \
- SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000);
diff --git a/bench/tensors/tensor_benchmarks_gpu.cc b/bench/tensors/tensor_benchmarks_gpu.cc
index adea754ad..9fe8f84d9 100644
--- a/bench/tensors/tensor_benchmarks_gpu.cc
+++ b/bench/tensors/tensor_benchmarks_gpu.cc
@@ -3,10 +3,8 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
-#include "strings/strcat.h"
-#include "third_party/eigen3/tensor_benchmarks.h"
-
+#include "tensor_benchmarks.h"
// Simple functions
#define BM_FuncGPU(FUNC) \