about · summary · refs · log · tree · commit · diff · homepage
path: root/src/core/SkTaskGroup.h
diff options
context:
space:
mode:
author: mtklein <mtklein@chromium.org>  2015-06-17 15:26:15 -0700
committer: Commit bot <commit-bot@chromium.org>  2015-06-17 15:26:15 -0700
commit  00b621cfc0dac2a0028757a974de33a78bb8579d (patch)
tree    696d1a4560d37af4929a6d0501611ceb88c0b45e /src/core/SkTaskGroup.h
parent  5a9e2994c9915f76b1e3720f107e87fc952ffab2 (diff)
Add sk_parallel_for()
This should be a drop-in replacement for most for-loops to make them run in parallel: for (int i = 0; i < N; i++) { code... } ~~~> sk_parallel_for(N, [&](int i) { code... }); This is just syntax sugar over SkTaskGroup to make this use case really easy to write. There's no more overhead that we weren't already forced to add using an interface like batch(), and no extra heap allocations. I've replaced 3 uses of SkTaskGroup with sk_parallel_for: 1) My unit tests for SkOnce. 2) Cary's path fuzzer. 3) SkMultiPictureDraw. Performance should be the same. Please compare left and right for readability. :) BUG=skia: No public API changes. TBR=reed@google.com Review URL: https://codereview.chromium.org/1184373003
Diffstat (limited to 'src/core/SkTaskGroup.h')
-rw-r--r--  src/core/SkTaskGroup.h  39
1 file changed, 39 insertions, 0 deletions
diff --git a/src/core/SkTaskGroup.h b/src/core/SkTaskGroup.h
index 8c7369da25..3af64d7753 100644
--- a/src/core/SkTaskGroup.h
+++ b/src/core/SkTaskGroup.h
@@ -10,6 +10,7 @@
#include "SkTypes.h"
#include "SkAtomics.h"
+#include "SkTemplates.h"
struct SkRunnable;
@@ -49,4 +50,42 @@ private:
SkAtomic<int32_t> fPending;
};
+// Returns best estimate of number of CPU cores available to use.
+int sk_num_cores();
+
+// Call f(i) for i in [0, end).
+template <typename Func>
+void sk_parallel_for(int end, const Func& f) {
+ if (end <= 0) { return; }
+
+ struct Chunk {
+ const Func* f;
+ int start, end;
+ };
+
+ // TODO(mtklein): this chunking strategy could probably use some tuning.
+ int max_chunks = sk_num_cores() * 2,
+ stride = (end + max_chunks - 1 ) / max_chunks,
+ nchunks = (end + stride - 1 ) / stride;
+ SkASSERT(nchunks <= max_chunks);
+
+ // With the chunking strategy above this won't malloc until we have a machine with >512 cores.
+ SkAutoSTMalloc<1024, Chunk> chunks(nchunks);
+
+ for (int i = 0; i < nchunks; i++) {
+ Chunk& c = chunks[i];
+ c.f = &f;
+ c.start = i * stride;
+ c.end = SkTMin(c.start + stride, end);
+ SkASSERT(c.start < c.end); // Nothing will break if start >= end, but it's a wasted chunk.
+ }
+
+ void(*run_chunk)(Chunk*) = [](Chunk* c) {
+ for (int i = c->start; i < c->end; i++) {
+ (*c->f)(i);
+ }
+ };
+ SkTaskGroup().batch(run_chunk, chunks.get(), nchunks);
+}
+
#endif//SkTaskGroup_DEFINED