From c4e47630b16a716d01dc20b36afa8882b03681a1 Mon Sep 17 00:00:00 2001 From: Yangqing Jia Date: Thu, 28 Jan 2016 10:35:14 -0800 Subject: benchmark modifications to make it compilable in a standalone fashion. --- bench/tensors/benchmark.h | 48 ++++++++ bench/tensors/benchmark_main.cc | 215 +++++++++++++++++++++++++++++++++ bench/tensors/tensor_benchmarks.h | 87 +++++++------ bench/tensors/tensor_benchmarks_cpu.cc | 20 +-- bench/tensors/tensor_benchmarks_gpu.cc | 4 +- 5 files changed, 318 insertions(+), 56 deletions(-) create mode 100644 bench/tensors/benchmark.h create mode 100644 bench/tensors/benchmark_main.cc (limited to 'bench') diff --git a/bench/tensors/benchmark.h b/bench/tensors/benchmark.h new file mode 100644 index 000000000..d8b4fd4c6 --- /dev/null +++ b/bench/tensors/benchmark.h @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include + +namespace testing { +class Benchmark { + public: + Benchmark(const char* name, void (*fn)(int)) { + Register(name, fn, NULL); + } + Benchmark(const char* name, void (*fn_range)(int, int)) { + Register(name, NULL, fn_range); + } + Benchmark* Arg(int x); + Benchmark* Range(int lo, int hi); + const char* Name(); + bool ShouldRun(int argc, char* argv[]); + void Run(); + private: + const char* name_; + void (*fn_)(int); + void (*fn_range_)(int, int); + std::vector args_; + void Register(const char* name, void (*fn)(int), void (*fn_range)(int, int)); + void RunRepeatedlyWithArg(int iterations, int arg); + void RunWithArg(int arg); +}; +} // namespace testing +void SetBenchmarkBytesProcessed(int64_t); +void StopBenchmarkTiming(); +void StartBenchmarkTiming(); +#define BENCHMARK(f) \ + static ::testing::Benchmark* _benchmark_##f __attribute__((unused)) = \ + (new ::testing::Benchmark(#f, f)) \ No newline at end of file diff --git a/bench/tensors/benchmark_main.cc b/bench/tensors/benchmark_main.cc new file mode 100644 index 000000000..0fc12960e --- /dev/null +++ b/bench/tensors/benchmark_main.cc @@ -0,0 +1,215 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "benchmark.h" +#include +#include +#include +#include +#include +#include +#include + +static int64_t g_bytes_processed; +static int64_t g_benchmark_total_time_ns; +static int64_t g_benchmark_start_time_ns; +typedef std::map BenchmarkMap; +typedef BenchmarkMap::iterator BenchmarkMapIt; +static BenchmarkMap g_benchmarks; +static int g_name_column_width = 20; +static int Round(int n) { + int base = 1; + while (base*10 < n) { + base *= 10; + } + if (n < 2*base) { + return 2*base; + } + if (n < 5*base) { + return 5*base; + } + return 10*base; +} +static int64_t NanoTime() { + struct timespec t; + t.tv_sec = t.tv_nsec = 0; + clock_gettime(CLOCK_MONOTONIC, &t); + return static_cast(t.tv_sec) * 1000000000LL + t.tv_nsec; +} +namespace testing { +Benchmark* Benchmark::Arg(int arg) { + args_.push_back(arg); + return this; +} + +Benchmark* Benchmark::Range(int lo, int hi) { + const int kRangeMultiplier = 8; + if (hi < lo) { + int temp = hi; + hi = lo; + lo = temp; + } + while (lo < hi) { + args_.push_back(lo); + lo *= kRangeMultiplier; + } + // We always run the hi number. + args_.push_back(hi); + return this; +} + +const char* Benchmark::Name() { + return name_; +} +bool Benchmark::ShouldRun(int argc, char* argv[]) { + if (argc == 1) { + return true; // With no arguments, we run all benchmarks. + } + // Otherwise, we interpret each argument as a regular expression and + // see if any of our benchmarks match. + for (int i = 1; i < argc; i++) { + regex_t re; + if (regcomp(&re, argv[i], 0) != 0) { + fprintf(stderr, "couldn't compile \"%s\" as a regular expression!\n", argv[i]); + exit(EXIT_FAILURE); + } + int match = regexec(&re, name_, 0, NULL, 0); + regfree(&re); + if (match != REG_NOMATCH) { + return true; + } + } + return false; +} +void Benchmark::Register(const char* name, void (*fn)(int), void (*fn_range)(int, int)) { + name_ = name; + fn_ = fn; + fn_range_ = fn_range; + if (fn_ == NULL && fn_range_ == NULL) { + fprintf(stderr, "%s: missing function\n", name_); + exit(EXIT_FAILURE); + } + g_benchmarks.insert(std::make_pair(name, this)); +} +void Benchmark::Run() { + if (fn_ != NULL) { + RunWithArg(0); + } else { + if (args_.empty()) { + fprintf(stderr, "%s: no args!\n", name_); + exit(EXIT_FAILURE); + } + for (size_t i = 0; i < args_.size(); ++i) { + RunWithArg(args_[i]); + } + } +} +void Benchmark::RunRepeatedlyWithArg(int iterations, int arg) { + g_bytes_processed = 0; + g_benchmark_total_time_ns = 0; + g_benchmark_start_time_ns = NanoTime(); + if (fn_ != NULL) { + fn_(iterations); + } else { + fn_range_(iterations, arg); + } + if (g_benchmark_start_time_ns != 0) { + g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns; + } +} +void Benchmark::RunWithArg(int arg) { + // run once in case it's expensive + int iterations = 1; + RunRepeatedlyWithArg(iterations, arg); + while (g_benchmark_total_time_ns < 1e9 && iterations < 1e9) { + int last = iterations; + if (g_benchmark_total_time_ns/iterations == 0) { + iterations = 1e9; + } else { + iterations = 1e9 / (g_benchmark_total_time_ns/iterations); + } + iterations = std::max(last + 1, std::min(iterations + iterations/2, 100*last)); + iterations = Round(iterations); + RunRepeatedlyWithArg(iterations, arg); + } + char throughput[100]; + throughput[0] = '\0'; + if (g_benchmark_total_time_ns > 0 && g_bytes_processed > 0) { + double mib_processed = static_cast(g_bytes_processed)/1e6; + double seconds = static_cast(g_benchmark_total_time_ns)/1e9; + snprintf(throughput, sizeof(throughput), " %8.2f MiB/s", mib_processed/seconds); + } + char full_name[100]; + if (fn_range_ != NULL) { + if (arg >= (1<<20)) { + snprintf(full_name, sizeof(full_name), "%s/%dM", name_, arg/(1<<20)); + } else if (arg >= (1<<10)) { + snprintf(full_name, sizeof(full_name), "%s/%dK", name_, arg/(1<<10)); + } else { + snprintf(full_name, sizeof(full_name), "%s/%d", name_, arg); + } + } else { + snprintf(full_name, sizeof(full_name), "%s", name_); + } + printf("%-*s %10d %10" PRId64 "%s\n", g_name_column_width, full_name, + iterations, g_benchmark_total_time_ns/iterations, throughput); + fflush(stdout); +} +} // namespace testing +void SetBenchmarkBytesProcessed(int64_t x) { + g_bytes_processed = x; +} +void StopBenchmarkTiming() { + if (g_benchmark_start_time_ns != 0) { + g_benchmark_total_time_ns += NanoTime() - g_benchmark_start_time_ns; + } + g_benchmark_start_time_ns = 0; +} +void StartBenchmarkTiming() { + if (g_benchmark_start_time_ns == 0) { + g_benchmark_start_time_ns = NanoTime(); + } +} +int main(int argc, char* argv[]) { + if (g_benchmarks.empty()) { + fprintf(stderr, "No benchmarks registered!\n"); + exit(EXIT_FAILURE); + } + for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { + int name_width = static_cast(strlen(it->second->Name())); + g_name_column_width = std::max(g_name_column_width, name_width); + } + bool need_header = true; + for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { + ::testing::Benchmark* b = it->second; + if (b->ShouldRun(argc, argv)) { + if (need_header) { + printf("%-*s %10s %10s\n", g_name_column_width, "", "iterations", "ns/op"); + fflush(stdout); + need_header = false; + } + b->Run(); + } + } + if (need_header) { + fprintf(stderr, "No matching benchmarks!\n"); + fprintf(stderr, "Available benchmarks:\n"); + for (BenchmarkMapIt it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { + fprintf(stderr, " %s\n", it->second->Name()); + } + exit(EXIT_FAILURE); + } + return 0; +} \ No newline at end of file diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index 525b9acda..a1696afda 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -4,12 +4,23 @@ typedef int TensorIndex; #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "testing/base/public/benchmark.h" +#include "unsupported/Eigen/CXX11/Tensor" +#include "benchmark.h" + +#define BENCHMARK_RANGE(bench, lo, hi) \ + BENCHMARK(bench)->Range(lo, hi) + +template +std::string StrCat(const Args... args) { + std::stringstream ss; + StrCatRecursive(ss, args...); + return ss.str(); +} using Eigen::Tensor; using Eigen::TensorMap; +typedef int64_t int64; // TODO(bsteiner): also templatize on the input type since we have users // for int8 as well as floats. @@ -43,7 +54,7 @@ template class BenchmarkSuite { void random(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array sizes(m_, m_); + const Eigen::array sizes = {{m_, m_}}; TensorMap, Eigen::Aligned> C(c_, sizes); StartBenchmarkTiming(); @@ -56,16 +67,16 @@ template class BenchmarkSuite { void slicing(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array sizes(m_, m_); + const Eigen::array sizes = {{m_, m_}}; const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); - const Eigen::DSizes quarter_sizes(Eigen::array(m_/2, m_/2)); - const Eigen::DSizes first_quadrant(Eigen::array(0, 0)); - const Eigen::DSizes second_quadrant(Eigen::array(0, m_/2)); - const Eigen::DSizes third_quadrant(Eigen::array(m_/2, 0)); - const Eigen::DSizes fourth_quadrant(Eigen::array(m_/2, m_/2)); + const Eigen::DSizes quarter_sizes(m_/2, m_/2); + const Eigen::DSizes first_quadrant(0, 0); + const Eigen::DSizes second_quadrant(0, m_/2); + const Eigen::DSizes third_quadrant(m_/2, 0); + const Eigen::DSizes fourth_quadrant(m_/2, m_/2); StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -85,12 +96,12 @@ template class BenchmarkSuite { void shuffling(int num_iters) { eigen_assert(m_ == n_); - const Eigen::array size_a(m_, k_); + const Eigen::array size_a = {{m_, k_}}; const TensorMap, Eigen::Aligned> A(a_, size_a); - const Eigen::array size_b(k_, m_); + const Eigen::array size_b = {{k_, m_}}; TensorMap, Eigen::Aligned> B(b_, size_b); - const Eigen::array shuffle(1, 0); + const Eigen::array shuffle = {{1, 0}}; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -102,9 +113,9 @@ template class BenchmarkSuite { void padding(int num_iters) { eigen_assert(m_ == k_); - const Eigen::array size_a(m_, k_-3); + const Eigen::array size_a = {{m_, k_-3}}; const TensorMap, Eigen::Aligned> A(a_, size_a); - const Eigen::array size_b(k_, m_); + const Eigen::array size_b = {{k_, m_}}; TensorMap, Eigen::Aligned> B(b_, size_b); Eigen::array, 2> paddings; @@ -121,12 +132,12 @@ template class BenchmarkSuite { void striding(int num_iters) { eigen_assert(m_ == k_); - const Eigen::array size_a(m_, k_); + const Eigen::array size_a = {{m_, k_}}; const TensorMap, Eigen::Aligned> A(a_, size_a); - const Eigen::array size_b(m_, k_ / 2); + const Eigen::array size_b = {{m_, k_ / 2}}; TensorMap, Eigen::Aligned> B(b_, size_b); - const Eigen::array strides(1, 2); + const Eigen::array strides = {{1, 2}}; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -137,14 +148,14 @@ template class BenchmarkSuite { } void broadcasting(int num_iters) { - const Eigen::array size_a(m_, 1); + const Eigen::array size_a = {{m_, 1}}; const TensorMap, Eigen::Aligned> A(a_, size_a); - const Eigen::array size_c(m_, n_); + const Eigen::array size_c = {{m_, n_}}; TensorMap, Eigen::Aligned> C(c_, size_c); -#if defined(__CUDACC__) +#ifndef EIGEN_HAS_INDEX_LIST // nvcc doesn't support cxx11 - const Eigen::array broadcast(1, n_); + const Eigen::array broadcast = {{1, n_}}; #else // Take advantage of cxx11 to give the compiler information it can use to // optimize the code. @@ -162,7 +173,7 @@ template class BenchmarkSuite { void coeffWiseOp(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array sizes(m_, m_); + const Eigen::array sizes = {{m_, m_}}; const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); @@ -178,7 +189,7 @@ template class BenchmarkSuite { void algebraicFunc(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array sizes(m_, m_); + const Eigen::array sizes = {{m_, m_}}; const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); @@ -194,7 +205,7 @@ template class BenchmarkSuite { void transcendentalFunc(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); - const Eigen::array sizes(m_, m_); + const Eigen::array sizes = {{m_, m_}}; const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); @@ -210,12 +221,12 @@ template class BenchmarkSuite { // Simple reduction void reduction(int num_iters) { - const Eigen::array input_size(k_, n_); + const Eigen::array input_size = {{k_, n_}}; const TensorMap, Eigen::Aligned> B(b_, input_size); - const Eigen::array output_size(n_); + const Eigen::array output_size = {{n_}}; TensorMap, Eigen::Aligned> C(c_, output_size); - const Eigen::array sum_along_dim(0); + const Eigen::array sum_along_dim = {{0}}; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -228,16 +239,16 @@ template class BenchmarkSuite { // do a contraction which is equivalent to a matrix multiplication void contraction(int num_iters) { - const Eigen::array sizeA(m_, k_); - const Eigen::array sizeB(k_, n_); - const Eigen::array sizeC(m_, n_); + const Eigen::array sizeA = {{m_, k_}}; + const Eigen::array sizeB = {{k_, n_}}; + const Eigen::array sizeC = {{m_, n_}}; const TensorMap, Eigen::Aligned> A(a_, sizeA); const TensorMap, Eigen::Aligned> B(b_, sizeB); TensorMap, Eigen::Aligned> C(c_, sizeC); typedef typename Tensor::DimensionPair DimPair; - const Eigen::array dims(DimPair(1, 0)); + const Eigen::array dims = {{DimPair(1, 0)}}; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -249,14 +260,14 @@ template class BenchmarkSuite { } void convolution(int num_iters, int kernel_x, int kernel_y) { - const Eigen::array input_sizes(m_, n_); + const Eigen::array input_sizes = {{m_, n_}}; TensorMap, Eigen::Aligned> A(a_, input_sizes); - const Eigen::array kernel_sizes(kernel_x, kernel_y); + const Eigen::array kernel_sizes = {{kernel_x, kernel_y}}; TensorMap, Eigen::Aligned> B(b_, kernel_sizes); - const Eigen::array result_sizes( - m_ - kernel_x + 1, n_ - kernel_y + 1); + const Eigen::array result_sizes = + {{m_ - kernel_x + 1, n_ - kernel_y + 1}}; TensorMap, Eigen::Aligned> C(c_, result_sizes); - Eigen::array::Index, 2> dims(0, 1); + Eigen::array::Index, 2> dims = {{0, 1}}; StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { @@ -280,7 +291,7 @@ template class BenchmarkSuite { device_.memset(b_, 23, k_ * n_ * sizeof(float)); device_.memset(c_, 31, m_ * n_ * sizeof(float)); - BenchmarkUseRealTime(); + //BenchmarkUseRealTime(); } inline void finalizeBenchmark(int64 num_items) { @@ -290,7 +301,7 @@ template class BenchmarkSuite { } #endif StopBenchmarkTiming(); - SetBenchmarkItemsProcessed(num_items); + SetBenchmarkBytesProcessed(num_items); } diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc index 68653ba15..248a63861 100644 --- a/bench/tensors/tensor_benchmarks_cpu.cc +++ b/bench/tensors/tensor_benchmarks_cpu.cc @@ -1,19 +1,12 @@ #define EIGEN_USE_THREADS -#include "base/sysinfo.h" -#include "strings/strcat.h" -#include "third_party/eigen3/tensor_benchmarks.h" -#include "thread/threadpool.h" +#include + +#include "tensor_benchmarks.h" -#ifdef __ANDROID__ -#define CREATE_THREAD_POOL(threads) \ -Eigen::ThreadPoolDevice device(threads); -#else #define CREATE_THREAD_POOL(threads) \ -ThreadPool tp(threads); \ -tp.StartWorkers(); \ -Eigen::ThreadPoolDevice device(&tp, threads); -#endif +Eigen::ThreadPool pool(threads); \ +Eigen::ThreadPoolDevice device(&pool, threads); // Simple functions #define BM_FuncCPU(FUNC, THREADS) \ @@ -22,7 +15,6 @@ Eigen::ThreadPoolDevice device(&tp, threads); CREATE_THREAD_POOL(THREADS); \ BenchmarkSuite suite(device, N); \ suite.FUNC(iters); \ - SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ } \ BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000); @@ -84,7 +76,6 @@ BM_FuncCPU(reduction, 12); BenchmarkSuite suite(device, D1, D2, D3); \ suite.FUNC(iters); \ } \ - SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ } \ BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000); @@ -127,7 +118,6 @@ BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16); CREATE_THREAD_POOL(THREADS); \ BenchmarkSuite suite(device, N); \ suite.FUNC(iters, DIM1, DIM2); \ - SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ } \ BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000); diff --git a/bench/tensors/tensor_benchmarks_gpu.cc b/bench/tensors/tensor_benchmarks_gpu.cc index adea754ad..9fe8f84d9 100644 --- a/bench/tensors/tensor_benchmarks_gpu.cc +++ b/bench/tensors/tensor_benchmarks_gpu.cc @@ -3,10 +3,8 @@ #include #include #include -#include "strings/strcat.h" -#include "third_party/eigen3/tensor_benchmarks.h" - +#include "tensor_benchmarks.h" // Simple functions #define BM_FuncGPU(FUNC) \ -- cgit v1.2.3