From 4f6e271596cf8aaa5fd90801c0b353d5eb7c9f78 Mon Sep 17 00:00:00 2001
From: Mike Klein <mtklein@chromium.org>
Date: Fri, 11 Aug 2017 10:37:35 -0400
Subject: ok, add a bench source
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This new source acts like other sources (GMs, SKPs) for benchmarks.  It
times multiple samples (controlled by samples=N, default 20), and each
of those samples uses the same strategy as monobench, growing loops
exponentially until it runs for at least 10ms.

When done it prints the fastest and the two slowest samples.  In
practice the 100th percentile sample is very different from the
next slowest due to caching, and the fastest is always interesting.

Because these benchmarks run in whatever execution engine ok has
selected, on non-Windows platforms you have some real control over the
interaction between benchmarks.  In its default "fork" mode each
benchmark runs independently in its own process, so the 100th
percentiles really stand out.  The other modes "thread" and "serial"
work as you'd expect too.

Here's an example where you can see how the different interactions work:

out/ok bench:samples=100 8888 filter:search=text_16_AA fork
    [text_16_AA_WT] 2.32µs  @0  6.23µs  @99 24.3ms  @100
    [text_16_AA_FF] 2.41µs  @0  5.7µs   @99 23.3ms  @100
    [text_16_AA_88] 2.55µs  @0  5.6µs   @99 24.8ms  @100
    [text_16_AA_BK] 1.97µs  @0  5.44µs  @99 23.2ms  @100

out/ok bench:samples=100 8888 filter:search=text_16_AA thread
    [text_16_AA_FF] 2.45µs  @0  23.5µs  @99 24.8ms  @100
    [text_16_AA_WT] 2.52µs  @0  17.8µs  @99 24.7ms  @100
    [text_16_AA_88] 2.55µs  @0  19.7µs  @99 25.1ms  @100
    [text_16_AA_BK] 1.8µs   @0  14.7µs  @99 25.1ms  @100

out/ok bench:samples=100 8888 filter:search=text_16_AA serial
    [text_16_AA_88] 2.35µs  @0  3.53µs  @99 16.7ms  @100
    [text_16_AA_FF] 2.09µs  @0  2.73µs  @99 2.91µs  @100
    [text_16_AA_BK] 1.75µs  @0  2.46µs  @99 2.65µs  @100
    [text_16_AA_WT] 2.1µs   @0  3.16µs  @99 3.17µs  @100

In the first "fork" case all runs are independent and have roughly
the same profile.  "thread" looks similar except you can see them
contending at the 99th percentile.  In "serial", the first bench
warms up the rest, so their 100th percentiles are all much faster.

Change-Id: I01a9f8c54b540221a9f232b271bb8ef3fda2569c
Reviewed-on: https://skia-review.googlesource.com/33585
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
---
 tools/ok_srcs.cpp | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 101 insertions(+), 2 deletions(-)

(limited to 'tools')
diff --git a/tools/ok_srcs.cpp b/tools/ok_srcs.cpp
index dc040f86c9..ae1c5d8ca7 100644
--- a/tools/ok_srcs.cpp
+++ b/tools/ok_srcs.cpp
@@ -5,11 +5,16 @@
  * found in the LICENSE file.
  */
 
-#include "ok.h"
-#include "gm.h"
+#include "Benchmark.h"
 #include "SkData.h"
 #include "SkOSFile.h"
 #include "SkPicture.h"
+#include "Timer.h"
+#include "gm.h"
+#include "ok.h"
+#include <algorithm>
+#include <chrono>
+#include <limits>
 #include <vector>
 
 struct GMStream : Stream {
@@ -112,3 +117,97 @@ struct SKPStream : Stream {
     }
 };
 static Register skp{"skp", "draw SKPs from dir=skps", SKPStream::Create};
+
+struct BenchStream : Stream {  // Stream that hands out one BenchSrc per entry of the BenchRegistry list.
+    const BenchRegistry* registry = BenchRegistry::Head();  // cursor into the registry list; advanced by next()
+    int samples;  // number of samples to time per benchmark; assigned in Create()
+
+    static std::unique_ptr<Stream> Create(Options options) {  // Builds the stream; reads the "samples" option, default "20".
+        BenchStream stream;
+        stream.samples = std::max(1, atoi(options("samples", "20").c_str()));  // clamped to at least 1
+        return move_unique(stream);
+    }
+
+    struct BenchSrc : Src {  // A single benchmark exposed through the Src interface.
+        Benchmark* (*factory)(void*);  // creates the Benchmark; init() calls it with nullptr
+        std::unique_ptr<Benchmark> bench;  // empty until init() runs
+        int samples;  // number of timed samples draw() collects
+
+        void init() {  // Creates bench on first use; later calls return immediately.
+            if (bench) { return; }
+            bench.reset(factory(nullptr));
+        }
+
+        std::string name() override {  // Returns the benchmark's getName().
+            this->init();
+            return bench->getName();
+        }
+
+        SkISize size() override {  // Returns the x/y of the benchmark's getSize() as an SkISize.
+            this->init();
+            return { bench->getSize().x(), bench->getSize().y() };
+        }
+
+        Status draw(SkCanvas* canvas) override {  // Times the benchmark `samples` times, logs a summary, always returns Status::OK.
+            this->init();
+
+            using ms = std::chrono::duration<double, std::milli>;  // milliseconds stored as double
+            std::vector<ms> sample(samples);  // per-loop time for each sample, value-initialized
+
+            bench->delayedSetup();
+            if (canvas) {  // the per-canvas hooks are skipped when canvas is null
+                bench->perCanvasPreDraw(canvas);
+            }
+            for (int i = 0; i < samples; i++) {
+                using clock = std::chrono::high_resolution_clock;
+                for (int loops = 1; loops < 1000000000; loops *= 2) {  // double loops until one run lasts at least 10ms
+                    bench->preDraw(canvas);
+                    auto start = clock::now();  // only the draw() call is inside the timed region
+                        bench->draw(loops, canvas);
+                    ms elapsed = clock::now() - start;
+                    bench->postDraw(canvas);
+
+                    if (elapsed.count() < 10) {  // under 10ms: try again with twice as many loops
+                        continue;
+                    }
+
+                    sample[i] = elapsed / loops;  // average time per loop for this sample
+                    break;
+                }  // if no run reached 10ms before the loops cap, sample[i] keeps its initial value
+            }
+            if (canvas) {
+                bench->perCanvasPostDraw(canvas);
+            }
+
+            std::sort(sample.begin(), sample.end());  // ascending: fastest first, slowest last
+
+            SkString msg = SkStringPrintf("%s\t@0", HumanizeMs(sample[0].count()).c_str());  // fastest sample, labeled @0
+            if (samples > 2) {  // second-slowest sample, only when it is distinct from the fastest
+                msg.appendf("\t%s\t@%g", HumanizeMs(sample[samples-2].count()).c_str()
+                                       , 100.0*(samples-1) / samples);  // label is 100*(samples-1)/samples, e.g. 99 for samples=100
+            }
+            if (samples > 1) {  // slowest sample, labeled @100
+                msg.appendf("\t%s\t@100", HumanizeMs(sample[samples-1].count()).c_str());
+            }
+            ok_log(msg.c_str());
+
+            return Status::OK;
+        }
+    };
+
+    std::unique_ptr<Src> next() override {  // Returns a BenchSrc for the current registry entry, or nullptr once the list is exhausted.
+        if (!registry) {
+            return nullptr;
+        }
+        BenchSrc src;
+        src.factory = registry->factory();
+        src.samples = samples;  // each BenchSrc carries its own copy of the sample count
+        registry = registry->next();  // advance the cursor to the following entry
+        return move_unique(src);
+    }
+};
+static Register bench{  // "bench" entry: name, description string, and the factory that builds a BenchStream
+    "bench",
+    "time benchmarks linked into this binary samples=20 times each",
+    BenchStream::Create,
+};
-- 
cgit v1.2.3