Add sk_parallel_for()

This should be a drop-in replacement for most for-loops to make them run in parallel: for (int i = 0; i < N; i++) { code... } ~~~> sk_parallel_for(N, [&](int i) { code... }); This is just syntax sugar over SkTaskGroup to make this use case really easy to write. There's no more overhead that we weren't already forced to add using an interface like batch(), and no extra heap allocations. I've replaced 3 uses of SkTaskGroup with sk_parallel_for: 1) My unit tests for SkOnce. 2) Cary's path fuzzer. 3) SkMultiPictureDraw. Performance should be the same. Please compare left and right for readability. :) BUG=skia: No public API changes. TBR=reed@google.com Review URL: https://codereview.chromium.org/1184373003
author: mtklein <mtklein@chromium.org> 2015-06-17 15:26:15 -0700
committer: Commit bot <commit-bot@chromium.org> 2015-06-17 15:26:15 -0700
commit: 00b621cfc0dac2a0028757a974de33a78bb8579d (patch)
tree: 696d1a4560d37af4929a6d0501611ceb88c0b45e
parent: 5a9e2994c9915f76b1e3720f107e87fc952ffab2 (diff)
12 files changed, 136 insertions, 164 deletions
diff --git a/include/core/SkMultiPictureDraw.h b/include/core/SkMultiPictureDraw.h
index 809e2dfe29..8d24e615e8 100644
--- a/include/core/SkMultiPictureDraw.h
+++ b/include/core/SkMultiPictureDraw.h
@@ -66,8 +66,6 @@ private:
         void draw();
 
         static void Reset(SkTDArray<DrawData>&);
-
-        static void Draw(DrawData* d) { d->draw(); }
     };
 
     SkTDArray<DrawData> fThreadSafeDrawData;
diff --git a/samplecode/SamplePathFuzz.cpp b/samplecode/SamplePathFuzz.cpp
index f2595c59ea..55f7f4129d 100644
--- a/samplecode/SamplePathFuzz.cpp
+++ b/samplecode/SamplePathFuzz.cpp
@@ -617,42 +617,31 @@ static bool contains_only_moveTo(const SkPath& path) {
 #include "SkTaskGroup.h"
 #include "SkTDArray.h"
 
-struct ThreadState {
-    int fSeed;
-    const SkBitmap* fBitmap;
-};
-
-static void test_fuzz(ThreadState* data) {
-    FuzzPath fuzzPath;
-    fuzzPath.setStrokeOnly();
-    fuzzPath.setSeed(data->fSeed);
-    fuzzPath.randomize();
-    const SkPath& path = fuzzPath.getPath();
-    const SkPaint& paint = fuzzPath.getPaint();
-    const SkImageInfo& info = data->fBitmap->info();
-    SkCanvas* canvas(SkCanvas::NewRasterDirect(info, data->fBitmap->getPixels(),
-            data->fBitmap->rowBytes()));
-    int w = info.width() / 4;
-    int h = info.height() / 4;
-    int x = data->fSeed / 4 % 4;
-    int y = data->fSeed % 4;
-    SkRect clipBounds = SkRect::MakeXYWH(SkIntToScalar(x) * w, SkIntToScalar(y) * h, 
-        SkIntToScalar(w), SkIntToScalar(h));
-    canvas->save();
-        canvas->clipRect(clipBounds);
-        canvas->translate(SkIntToScalar(x) * w, SkIntToScalar(y) * h);
-        canvas->drawPath(path, paint);
-    canvas->restore();
-}
-
 static void path_fuzz_stroker(SkBitmap* bitmap, int seed) {
-    ThreadState states[100];
-    for (size_t i = 0; i < SK_ARRAY_COUNT(states); i++) {
-        states[i].fSeed   = seed + (int) i;
-        states[i].fBitmap = bitmap;
-    }
-    SkTaskGroup tg;
-    tg.batch(test_fuzz, states, SK_ARRAY_COUNT(states));
+    sk_parallel_for(100, [&](int i) {
+        int localSeed = seed + i;
+
+        FuzzPath fuzzPath;
+        fuzzPath.setStrokeOnly();
+        fuzzPath.setSeed(localSeed);
+        fuzzPath.randomize();
+        const SkPath& path = fuzzPath.getPath();
+        const SkPaint& paint = fuzzPath.getPaint();
+        const SkImageInfo& info = bitmap->info();
+        SkCanvas* canvas(
+            SkCanvas::NewRasterDirect(info, bitmap->getPixels(), bitmap->rowBytes()));
+        int w = info.width() / 4;
+        int h = info.height() / 4;
+        int x = localSeed / 4 % 4;
+        int y = localSeed % 4;
+        SkRect clipBounds = SkRect::MakeXYWH(SkIntToScalar(x) * w, SkIntToScalar(y) * h,
+            SkIntToScalar(w), SkIntToScalar(h));
+        canvas->save();
+            canvas->clipRect(clipBounds);
+            canvas->translate(SkIntToScalar(x) * w, SkIntToScalar(y) * h);
+            canvas->drawPath(path, paint);
+        canvas->restore();
+    });
 }
 
 class PathFuzzView : public SampleView {
@@ -673,7 +662,7 @@ protected:
 
     void onOnceBeforeDraw() override {
         fIndex = 0;
-        SkImageInfo info(SkImageInfo::MakeN32Premul(SkScalarRoundToInt(width()), 
+        SkImageInfo info(SkImageInfo::MakeN32Premul(SkScalarRoundToInt(width()),
                 SkScalarRoundToInt(height())));
         offscreen.allocPixels(info);
         path_fuzz_stroker(&offscreen, fIndex);
diff --git a/src/core/SkMultiPictureDraw.cpp b/src/core/SkMultiPictureDraw.cpp
index 22fd05fb08..286d3b6489 100644
--- a/src/core/SkMultiPictureDraw.cpp
+++ b/src/core/SkMultiPictureDraw.cpp
@@ -91,17 +91,16 @@ void SkMultiPictureDraw::draw(bool flush) {
 
 #ifdef FORCE_SINGLE_THREAD_DRAWING_FOR_TESTING
     for (int i = 0; i < fThreadSafeDrawData.count(); ++i) {
-        DrawData* dd = &fThreadSafeDrawData.begin()[i];
-        dd->fCanvas->drawPicture(dd->fPicture, &dd->fMatrix, dd->fPaint);
+        fThreadSafeDrawData[i].draw();
     }
 #else
-    // we place the taskgroup after the MPDReset, to ensure that we don't delete the DrawData
-    // objects until after we're finished the tasks (which have pointers to the data).
-    SkTaskGroup group;
-    group.batch(DrawData::Draw, fThreadSafeDrawData.begin(), fThreadSafeDrawData.count());
+    sk_parallel_for(fThreadSafeDrawData.count(), [&](int i) {
+        fThreadSafeDrawData[i].draw();
+    });
 #endif
-    // we deliberately don't call wait() here, since the destructor will do that, this allows us
-    // to continue processing gpu-data without having to wait on the cpu tasks.
+
+    // N.B. we could get going on any GPU work from this main thread while the CPU work runs.
+    // But in practice, we've either got GPU work or CPU work, not both.
 
     const int count = fGPUDrawData.count();
     if (0 == count) {
diff --git a/src/core/SkTaskGroup.cpp b/src/core/SkTaskGroup.cpp
index 59319c148d..4504defb1b 100644
--- a/src/core/SkTaskGroup.cpp
+++ b/src/core/SkTaskGroup.cpp
@@ -5,6 +5,7 @@
  * found in the LICENSE file.
  */
 
+#include "SkOnce.h"
 #include "SkRunnable.h"
 #include "SkSemaphore.h"
 #include "SkSpinlock.h"
@@ -13,18 +14,27 @@
 #include "SkThreadUtils.h"
 
 #if defined(SK_BUILD_FOR_WIN32)
-    static inline int num_cores() {
+    static void query_num_cores(int* num_cores) {
         SYSTEM_INFO sysinfo;
         GetSystemInfo(&sysinfo);
-        return sysinfo.dwNumberOfProcessors;
+        *num_cores = sysinfo.dwNumberOfProcessors;
     }
 #else
     #include <unistd.h>
-    static inline int num_cores() {
-        return (int) sysconf(_SC_NPROCESSORS_ONLN);
+    static void query_num_cores(int* num_cores) {
+        *num_cores = (int)sysconf(_SC_NPROCESSORS_ONLN);
     }
 #endif
 
+// We cache sk_num_cores() so we only query the OS once.
+SK_DECLARE_STATIC_ONCE(g_query_num_cores_once);
+int sk_num_cores() {
+    static int num_cores = 0;
+    SkOnce(&g_query_num_cores_once, query_num_cores, &num_cores);
+    SkASSERT(num_cores > 0);
+    return num_cores;
+}
+
 namespace {
 
 class ThreadPool : SkNoncopyable {
@@ -98,7 +108,7 @@ private:
 
     explicit ThreadPool(int threads) {
         if (threads == -1) {
-            threads = num_cores();
+            threads = sk_num_cores();
         }
         for (int i = 0; i < threads; i++) {
             fThreads.push(SkNEW_ARGS(SkThread, (&ThreadPool::Loop, this)));
diff --git a/src/core/SkTaskGroup.h b/src/core/SkTaskGroup.h
index 8c7369da25..3af64d7753 100644
--- a/src/core/SkTaskGroup.h
+++ b/src/core/SkTaskGroup.h
@@ -10,6 +10,7 @@
 
 #include "SkTypes.h"
 #include "SkAtomics.h"
+#include "SkTemplates.h"
 
 struct SkRunnable;
 
@@ -49,4 +50,42 @@ private:
     SkAtomic<int32_t> fPending;
 };
 
+// Returns best estimate of number of CPU cores available to use.
+int sk_num_cores();
+
+// Call f(i) for i in [0, end).
+template <typename Func>
+void sk_parallel_for(int end, const Func& f) {
+    if (end <= 0) { return; }
+
+    struct Chunk {
+        const Func* f;
+        int start, end;
+    };
+
+    // TODO(mtklein): this chunking strategy could probably use some tuning.
+    int max_chunks  = sk_num_cores() * 2,
+        stride      = (end + max_chunks - 1 ) / max_chunks,
+        nchunks     = (end + stride - 1 ) / stride;
+    SkASSERT(nchunks <= max_chunks);
+
+    // With the chunking strategy above this won't malloc until we have a machine with >512 cores.
+    SkAutoSTMalloc<1024, Chunk> chunks(nchunks);
+
+    for (int i = 0; i < nchunks; i++) {
+        Chunk& c = chunks[i];
+        c.f     = &f;
+        c.start = i * stride;
+        c.end   = SkTMin(c.start + stride, end);
+        SkASSERT(c.start < c.end);  // Nothing will break if start >= end, but it's a wasted chunk.
+    }
+
+    void(*run_chunk)(Chunk*) = [](Chunk* c) {
+        for (int i = c->start; i < c->end; i++) {
+            (*c->f)(i);
+        }
+    };
+    SkTaskGroup().batch(run_chunk, chunks.get(), nchunks);
+}
+
 #endif//SkTaskGroup_DEFINED
diff --git a/tests/LazyPtrTest.cpp b/tests/LazyPtrTest.cpp
index 1b845bc502..89443f9a80 100644
--- a/tests/LazyPtrTest.cpp
+++ b/tests/LazyPtrTest.cpp
@@ -1,3 +1,10 @@
+/*
+ * Copyright 2014 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
 #include "Test.h"
 #include "SkLazyPtr.h"
 #include "SkRunnable.h"
@@ -44,37 +51,20 @@ DEF_TEST(LazyPtr, r) {
     SkDELETE(ptr);
 }
 
-namespace {
-
-struct Racer : public SkRunnable {
-    Racer() : fLazy(NULL), fSeen(NULL) {}
-
-    void run() override { fSeen = fLazy->get(); }
-
-    SkLazyPtr<int>* fLazy;
-    int* fSeen;
-};
-
-} // namespace
-
 DEF_TEST(LazyPtr_Threaded, r) {
     static const int kRacers = 321;
 
+    // Race to intialize the pointer by calling .get().
     SkLazyPtr<int> lazy;
+    int* seen[kRacers];
 
-    Racer racers[kRacers];
-    for (int i = 0; i < kRacers; i++) {
-        racers[i].fLazy = &lazy;
-    }
-
-    SkTaskGroup tg;
-    for (int i = 0; i < kRacers; i++) {
-        tg.add(racers + i);
-    }
-    tg.wait();
+    sk_parallel_for(kRacers, [&](int i) {
+        seen[i] = lazy.get();
+    });
 
+    // lazy.get() should return the same pointer to all threads.
     for (int i = 1; i < kRacers; i++) {
-        REPORTER_ASSERT(r, racers[i].fSeen);
-        REPORTER_ASSERT(r, racers[i].fSeen == racers[0].fSeen);
+        REPORTER_ASSERT(r, seen[i] != nullptr);
+        REPORTER_ASSERT(r, seen[i] == seen[0]);
     }
 }
diff --git a/tests/OnceTest.cpp b/tests/OnceTest.cpp
index 034c5d912f..35c2015166 100644
--- a/tests/OnceTest.cpp
+++ b/tests/OnceTest.cpp
@@ -28,42 +28,14 @@ DEF_TEST(SkOnce_Singlethreaded, r) {
     REPORTER_ASSERT(r, 5 == x);
 }
 
-static void add_six(int* x) {
-    *x += 6;
-}
-
-namespace {
-
-class Racer : public SkRunnable {
-public:
-    SkOnceFlag* once;
-    int* ptr;
-
-    void run() override {
-        SkOnce(once, add_six, ptr);
-    }
-};
-
-}  // namespace
-
 SK_DECLARE_STATIC_ONCE(mt_once);
 DEF_TEST(SkOnce_Multithreaded, r) {
-    const int kTasks = 16;
-
-    // Make a bunch of tasks that will race to be the first to add six to x.
-    Racer racers[kTasks];
     int x = 0;
-    for (int i = 0; i < kTasks; i++) {
-        racers[i].once = &mt_once;
-        racers[i].ptr = &x;
-    }
-
-    // Let them race.
-    SkTaskGroup tg;
-    for (int i = 0; i < kTasks; i++) {
-        tg.add(&racers[i]);
-    }
-    tg.wait();
+    // Run a bunch of tasks to be the first to add six to x.
+    sk_parallel_for(1021, [&](int) {
+        void(*add_six)(int*) = [](int* p) { *p += 6; };
+        SkOnce(&mt_once, add_six, &x);
+    });
 
     // Only one should have done the +=.
     REPORTER_ASSERT(r, 6 == x);
diff --git a/tests/PathOpsExtendedTest.cpp b/tests/PathOpsExtendedTest.cpp
index a8079cf0a9..df80f4673e 100644
--- a/tests/PathOpsExtendedTest.cpp
+++ b/tests/PathOpsExtendedTest.cpp
@@ -14,7 +14,6 @@
 #include "SkPaint.h"
 #include "SkRTConf.h"
 #include "SkStream.h"
-#include "SkTaskGroup.h"
 #include "SkThread.h"
 
 #ifdef SK_BUILD_FOR_MAC
diff --git a/tests/PathOpsSkpClipTest.cpp b/tests/PathOpsSkpClipTest.cpp
index 1f69ff05e1..7ea3184442 100755
--- a/tests/PathOpsSkpClipTest.cpp
+++ b/tests/PathOpsSkpClipTest.cpp
@@ -303,10 +303,11 @@ TestRunner::~TestRunner() {
 }
 
 void TestRunner::render() {
-    SkTaskGroup tg;
-    for (int index = 0; index < fRunnables.count(); ++ index) {
-        tg.add(fRunnables[index]);
-    }
+    // TODO: this doesn't really need to use SkRunnables any more.
+    // We can just write the code to run in the for-loop directly.
+    sk_parallel_for(fRunnables.count(), [&](int i) {
+        fRunnables[i]->run();
+    });
 }
 
 ////////////////////////////////////////////////
diff --git a/tests/PathOpsThreadedCommon.cpp b/tests/PathOpsThreadedCommon.cpp
index 0adde915e0..10501d139c 100644
--- a/tests/PathOpsThreadedCommon.cpp
+++ b/tests/PathOpsThreadedCommon.cpp
@@ -16,8 +16,7 @@ PathOpsThreadedTestRunner::~PathOpsThreadedTestRunner() {
 }
 
 void PathOpsThreadedTestRunner::render() {
-    SkTaskGroup tg;
-    for (int index = 0; index < fRunnables.count(); ++ index) {
-        tg.add(fRunnables[index]);
-    }
+    sk_parallel_for(fRunnables.count(), [&](int i) {
+        fRunnables[i]->run();
+    });
 }
diff --git a/tests/SkpSkGrTest.cpp b/tests/SkpSkGrTest.cpp
index 2c3c5b7fbf..212b0f6abd 100644
--- a/tests/SkpSkGrTest.cpp
+++ b/tests/SkpSkGrTest.cpp
@@ -1,6 +1,9 @@
-#if !SK_SUPPORT_GPU
-#error "GPU support required"
-#endif
+/*
+ * Copyright 2013 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
 
 #include "GrContext.h"
 #include "GrContextFactory.h"
@@ -27,6 +30,10 @@
 #include "SkTime.h"
 #include "Test.h"
 
+#if !SK_SUPPORT_GPU
+#error "GPU support required"
+#endif
+
 #ifdef SK_BUILD_FOR_WIN
     #define PATH_SLASH "\\"
     #define IN_DIR "D:\\9-30-13\\"
@@ -162,10 +169,11 @@ SkpSkGrThreadedTestRunner::~SkpSkGrThreadedTestRunner() {
 }
 
 void SkpSkGrThreadedTestRunner::render() {
-    SkTaskGroup tg;
-    for (int index = 0; index < fRunnables.count(); ++ index) {
-        tg.add(fRunnables[index]);
-    }
+    // TODO: we don't really need to be using SkRunnables here anymore.
+    // We can just write the code we'd run right in the for loop.
+    sk_parallel_for(fRunnables.count(), [&](int i) {
+        fRunnables[i]->run();
+    });
 }
 
 ////////////////////////////////////////////////
diff --git a/tools/skpdiff/SkDiffContext.cpp b/tools/skpdiff/SkDiffContext.cpp
index c422636f6e..d14546046c 100644
--- a/tools/skpdiff/SkDiffContext.cpp
+++ b/tools/skpdiff/SkDiffContext.cpp
@@ -209,26 +209,6 @@ void SkDiffContext::addDiff(const char* baselinePath, const char* testPath) {
     }
 }
 
-class SkThreadedDiff : public SkRunnable {
-public:
-    SkThreadedDiff() : fDiffContext(NULL) { }
-
-    void setup(SkDiffContext* diffContext, const SkString& baselinePath, const SkString& testPath) {
-        fDiffContext = diffContext;
-        fBaselinePath = baselinePath;
-        fTestPath = testPath;
-    }
-
-    void run() override {
-        fDiffContext->addDiff(fBaselinePath.c_str(), fTestPath.c_str());
-    }
-
-private:
-    SkDiffContext* fDiffContext;
-    SkString fBaselinePath;
-    SkString fTestPath;
-};
-
 void SkDiffContext::diffDirectories(const char baselinePath[], const char testPath[]) {
     // Get the files in the baseline, we will then look for those inside the test path
     SkTArray<SkString> baselineEntries;
@@ -237,12 +217,8 @@ void SkDiffContext::diffDirectories(const char baselinePath[], const char testPa
         return;
     }
 
-    SkTaskGroup tg;
-    SkTArray<SkThreadedDiff> runnableDiffs;
-    runnableDiffs.reset(baselineEntries.count());
-
-    for (int x = 0; x < baselineEntries.count(); x++) {
-        const char* baseFilename = baselineEntries[x].c_str();
+    sk_parallel_for(baselineEntries.count(), [&](int i) {
+        const char* baseFilename = baselineEntries[i].c_str();
 
         // Find the real location of each file to compare
         SkString baselineFile = SkOSPath::Join(baselinePath, baseFilename);
@@ -250,13 +226,11 @@ void SkDiffContext::diffDirectories(const char baselinePath[], const char testPa
 
         // Check that the test file exists and is a file
         if (sk_exists(testFile.c_str()) && !sk_isdir(testFile.c_str())) {
-            // Queue up the comparison with the differ
-            runnableDiffs[x].setup(this, baselineFile, testFile);
-            tg.add(&runnableDiffs[x]);
+            this->addDiff(baselineFile.c_str(), testFile.c_str());
         } else {
             SkDebugf("Baseline file \"%s\" has no corresponding test file\n", baselineFile.c_str());
         }
-    }
+    });
 }
 
 
@@ -281,15 +255,9 @@ void SkDiffContext::diffPatterns(const char baselinePattern[], const char testPa
         return;
     }
 
-    SkTaskGroup tg;
-    SkTArray<SkThreadedDiff> runnableDiffs;
-    runnableDiffs.reset(baselineEntries.count());
-
-    for (int x = 0; x < baselineEntries.count(); x++) {
-        runnableDiffs[x].setup(this, baselineEntries[x], testEntries[x]);
-        tg.add(&runnableDiffs[x]);
-    }
-    tg.wait();
+    sk_parallel_for(baselineEntries.count(), [&](int i) {
+        this->addDiff(baselineEntries[i].c_str(), testEntries[i].c_str());
+    });
 }
 
 void SkDiffContext::outputRecords(SkWStream& stream, bool useJSONP) {
author	mtklein <mtklein@chromium.org>	2015-06-17 15:26:15 -0700
committer	Commit bot <commit-bot@chromium.org>	2015-06-17 15:26:15 -0700
commit	00b621cfc0dac2a0028757a974de33a78bb8579d (patch)
tree	696d1a4560d37af4929a6d0501611ceb88c0b45e
parent	5a9e2994c9915f76b1e3720f107e87fc952ffab2 (diff)