From 279c7864090a7b96c34c3594e38ced35967c673f Mon Sep 17 00:00:00 2001 From: mtklein Date: Mon, 4 Jan 2016 19:13:19 -0800 Subject: If we swap its arguments, SkTaskGroup::batch() _is_ sk_parallel_for. Why have two names if we can get away with one? This kills off sk_parallel_for_thread_count(), which was only used to avoid forcing a deadlock in OncePtrTest on multicore machines in singlethreaded mode... a really niche use case. Instead just don't explicitly force a race. BUG=skia: GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1552093002 Review URL: https://codereview.chromium.org/1552093002 --- bench/SkGlyphCacheBench.cpp | 2 +- dm/DM.cpp | 5 +++-- samplecode/SamplePathFuzz.cpp | 2 +- src/core/SkMultiPictureDraw.cpp | 2 +- src/core/SkTaskGroup.cpp | 17 +++++---------- src/core/SkTaskGroup.h | 46 +---------------------------------------- tests/BlendTest.cpp | 2 +- tests/OncePtrTest.cpp | 11 ++-------- tests/OnceTest.cpp | 2 +- tests/PathOpsSkpClipTest.cpp | 2 +- tests/PathOpsThreadedCommon.cpp | 2 +- tests/SkSharedMutexTest.cpp | 2 +- tests/SkpSkGrTest.cpp | 2 +- 13 files changed, 20 insertions(+), 77 deletions(-) diff --git a/bench/SkGlyphCacheBench.cpp b/bench/SkGlyphCacheBench.cpp index c99b5728fd..2e0429fce3 100644 --- a/bench/SkGlyphCacheBench.cpp +++ b/bench/SkGlyphCacheBench.cpp @@ -95,7 +95,7 @@ protected: sk_tool_utils::create_portable_typeface("sans-serif", SkTypeface::kItalic)}; for (int work = 0; work < loops; work++) { - sk_parallel_for(16, [&](int threadIndex) { + SkTaskGroup().batch(16, [&](int threadIndex) { SkPaint paint; paint.setAntiAlias(true); paint.setSubpixelText(true); diff --git a/dm/DM.cpp b/dm/DM.cpp index ac37aeb09f..ace49076ac 100644 --- a/dm/DM.cpp +++ b/dm/DM.cpp @@ -1093,12 +1093,13 @@ int dm_main() { } SkTaskGroup tg; - tg.batch([](int i){ run_test(&gThreadedTests[i]); }, gThreadedTests.count()); + tg.batch(gThreadedTests.count(), [](int i){ run_test(&gThreadedTests[i]); }); for (int i = 0; i < kNumEnclaves; i++) { SkTArray* currentEnclave = &enclaves[i]; switch(i) { case kAnyThread_Enclave: - tg.batch([currentEnclave](int j) { Task::Run(&(*currentEnclave)[j]); }, currentEnclave->count()); + tg.batch(currentEnclave->count(), + [currentEnclave](int j) { Task::Run(&(*currentEnclave)[j]); }); break; case kGPU_Enclave: tg.add([currentEnclave](){ run_enclave_and_gpu_tests(currentEnclave); }); diff --git a/samplecode/SamplePathFuzz.cpp b/samplecode/SamplePathFuzz.cpp index 317833dfe9..d719826752 100644 --- a/samplecode/SamplePathFuzz.cpp +++ b/samplecode/SamplePathFuzz.cpp @@ -619,7 +619,7 @@ static bool contains_only_moveTo(const SkPath& path) { #include "SkTDArray.h" static void path_fuzz_stroker(SkBitmap* bitmap, int seed) { - sk_parallel_for(100, [&](int i) { + SkTaskGroup().batch(100, [&](int i) { int localSeed = seed + i; FuzzPath fuzzPath; diff --git a/src/core/SkMultiPictureDraw.cpp b/src/core/SkMultiPictureDraw.cpp index 672bd628b7..5745664858 100644 --- a/src/core/SkMultiPictureDraw.cpp +++ b/src/core/SkMultiPictureDraw.cpp @@ -94,7 +94,7 @@ void SkMultiPictureDraw::draw(bool flush) { fThreadSafeDrawData[i].draw(); } #else - sk_parallel_for(fThreadSafeDrawData.count(), [&](int i) { + SkTaskGroup().batch(fThreadSafeDrawData.count(), [&](int i) { fThreadSafeDrawData[i].draw(); }); #endif diff --git a/src/core/SkTaskGroup.cpp b/src/core/SkTaskGroup.cpp index e6b8532bb0..1799256d6f 100644 --- a/src/core/SkTaskGroup.cpp +++ b/src/core/SkTaskGroup.cpp @@ -54,12 +54,12 @@ public: gGlobal->add(fn, pending); } - static void Batch(std::function fn, int N, SkAtomic* pending) { + static void Batch(int N, std::function fn, SkAtomic* pending) { if (!gGlobal) { for (int i = 0; i < N; i++) { fn(i); } return; } - gGlobal->batch(fn, N, pending); + gGlobal->batch(N, fn, pending); } static void Wait(SkAtomic* pending) { @@ -142,7 +142,7 @@ private: fWorkAvailable.signal(1); } - void batch(std::function fn, int N, SkAtomic* pending) { + void batch(int N, std::function fn, SkAtomic* pending) { pending->fetch_add(+N, sk_memory_order_relaxed); // No barrier needed. { AutoLock lock(&fWorkLock); @@ -196,7 +196,6 @@ private: static ThreadPool* gGlobal; friend struct SkTaskGroup::Enabler; - friend int ::sk_parallel_for_thread_count(); }; ThreadPool* ThreadPool::gGlobal = nullptr; @@ -216,13 +215,7 @@ SkTaskGroup::SkTaskGroup() : fPending(0) {} void SkTaskGroup::wait() { ThreadPool::Wait(&fPending); } void SkTaskGroup::add(SkRunnable* task) { ThreadPool::Add(task, &fPending); } void SkTaskGroup::add(std::function fn) { ThreadPool::Add(fn, &fPending); } -void SkTaskGroup::batch (std::function fn, int N) { - ThreadPool::Batch(fn, N, &fPending); +void SkTaskGroup::batch(int N, std::function fn) { + ThreadPool::Batch(N, fn, &fPending); } -int sk_parallel_for_thread_count() { - if (ThreadPool::gGlobal != nullptr) { - return ThreadPool::gGlobal->fThreads.count(); - } - return 0; -} diff --git a/src/core/SkTaskGroup.h b/src/core/SkTaskGroup.h index d1daa44494..e6c36651fd 100644 --- a/src/core/SkTaskGroup.h +++ b/src/core/SkTaskGroup.h @@ -34,7 +34,7 @@ public: void add(std::function fn); // Add a batch of N tasks, all calling fn with different arguments. - void batch(std::function fn, int N); + void batch(int N, std::function fn); // Block until all Tasks previously add()ed to this SkTaskGroup have run. // You may safely reuse this SkTaskGroup after wait() returns. @@ -47,48 +47,4 @@ private: // Returns best estimate of number of CPU cores available to use. int sk_num_cores(); -int sk_parallel_for_thread_count(); - -// Call f(i) for i in [0, end). -template -void sk_parallel_for(int end, const Func& f) { - if (end <= 0) { return; } - - struct Chunk { - const Func* f; - int start, end; - }; - - // TODO(mtklein): this chunking strategy could probably use some tuning. - int max_chunks = sk_num_cores() * 2, - stride = (end + max_chunks - 1 ) / max_chunks, - nchunks = (end + stride - 1 ) / stride; - SkASSERT(nchunks <= max_chunks); - -#if defined(GOOGLE3) - // Stack frame size is limited in GOOGLE3. - SkAutoSTMalloc<512, Chunk> chunks(nchunks); -#else - // With the chunking strategy above this won't malloc until we have a machine with >512 cores. - SkAutoSTMalloc<1024, Chunk> chunks(nchunks); -#endif - - for (int i = 0; i < nchunks; i++) { - Chunk& c = chunks[i]; - c.f = &f; - c.start = i * stride; - c.end = SkTMin(c.start + stride, end); - SkASSERT(c.start < c.end); // Nothing will break if start >= end, but it's a wasted chunk. - } - - Chunk* chunkBase = chunks.get(); - auto run_chunk = [chunkBase](int i) { - Chunk& c = chunkBase[i]; - for (int i = c.start; i < c.end; i++) { - (*c.f)(i); - } - }; - SkTaskGroup().batch(run_chunk, nchunks); -} - #endif//SkTaskGroup_DEFINED diff --git a/tests/BlendTest.cpp b/tests/BlendTest.cpp index 518a7e241d..612492d271 100644 --- a/tests/BlendTest.cpp +++ b/tests/BlendTest.cpp @@ -98,5 +98,5 @@ DEF_TEST(Blend_premul_begets_premul, r) { }; // Parallelism helps speed things up on my desktop from ~725s to ~50s. - sk_parallel_for(SkXfermode::kLastMode, test_mode); + SkTaskGroup().batch(SkXfermode::kLastMode, test_mode); } diff --git a/tests/OncePtrTest.cpp b/tests/OncePtrTest.cpp index 103172751a..d01cee09fe 100644 --- a/tests/OncePtrTest.cpp +++ b/tests/OncePtrTest.cpp @@ -18,14 +18,7 @@ DEF_TEST(OncePtr, r) { return new int(5); }; - SkAtomic force_a_race(sk_parallel_for_thread_count()); - if (force_a_race < 1) { - return; - } - sk_parallel_for(sk_num_cores()*4, [&](size_t) { - force_a_race.fetch_add(-1); - while (force_a_race.load() > 0); - + SkTaskGroup().batch(sk_num_cores()*4, [&](size_t) { int* n = once.get(create); REPORTER_ASSERT(r, *n == 5); }); @@ -39,7 +32,7 @@ DEF_TEST(OnceNoPtr, r) { static SkAtomic calls(0); SkAtomic force_a_race(sk_num_cores()); - sk_parallel_for(sk_num_cores()*4, [&](size_t) { + SkTaskGroup().batch(sk_num_cores()*4, [&](size_t) { force_a_race.fetch_add(-1); while (force_a_race.load() > 0); diff --git a/tests/OnceTest.cpp b/tests/OnceTest.cpp index 35c2015166..3fd569a42a 100644 --- a/tests/OnceTest.cpp +++ b/tests/OnceTest.cpp @@ -32,7 +32,7 @@ SK_DECLARE_STATIC_ONCE(mt_once); DEF_TEST(SkOnce_Multithreaded, r) { int x = 0; // Run a bunch of tasks to be the first to add six to x. - sk_parallel_for(1021, [&](int) { + SkTaskGroup().batch(1021, [&](int) { void(*add_six)(int*) = [](int* p) { *p += 6; }; SkOnce(&mt_once, add_six, &x); }); diff --git a/tests/PathOpsSkpClipTest.cpp b/tests/PathOpsSkpClipTest.cpp index f82d75d036..e70e1c0c66 100644 --- a/tests/PathOpsSkpClipTest.cpp +++ b/tests/PathOpsSkpClipTest.cpp @@ -307,7 +307,7 @@ TestRunner::~TestRunner() { void TestRunner::render() { // TODO: this doesn't really need to use SkRunnables any more. // We can just write the code to run in the for-loop directly. - sk_parallel_for(fRunnables.count(), [&](int i) { + SkTaskGroup().batch(fRunnables.count(), [&](int i) { fRunnables[i]->run(); }); } diff --git a/tests/PathOpsThreadedCommon.cpp b/tests/PathOpsThreadedCommon.cpp index 342b560049..c9a06f0a52 100644 --- a/tests/PathOpsThreadedCommon.cpp +++ b/tests/PathOpsThreadedCommon.cpp @@ -16,7 +16,7 @@ PathOpsThreadedTestRunner::~PathOpsThreadedTestRunner() { } void PathOpsThreadedTestRunner::render() { - sk_parallel_for(fRunnables.count(), [&](int i) { + SkTaskGroup().batch(fRunnables.count(), [&](int i) { fRunnables[i]->run(); }); } diff --git a/tests/SkSharedMutexTest.cpp b/tests/SkSharedMutexTest.cpp index bdf072b6b7..845889174b 100644 --- a/tests/SkSharedMutexTest.cpp +++ b/tests/SkSharedMutexTest.cpp @@ -28,7 +28,7 @@ DEF_TEST(SkSharedMutexMultiThreaded, r) { for (int i = 0; i < kSharedSize; ++i) { shared[i] = 0; } - sk_parallel_for(8, [&](int threadIndex) { + SkTaskGroup().batch(8, [&](int threadIndex) { if (threadIndex % 4 != 0) { for (int c = 0; c < 100000; ++c) { sm.acquireShared(); diff --git a/tests/SkpSkGrTest.cpp b/tests/SkpSkGrTest.cpp index 98c54e032c..241395a753 100644 --- a/tests/SkpSkGrTest.cpp +++ b/tests/SkpSkGrTest.cpp @@ -171,7 +171,7 @@ SkpSkGrThreadedTestRunner::~SkpSkGrThreadedTestRunner() { void SkpSkGrThreadedTestRunner::render() { // TODO: we don't really need to be using SkRunnables here anymore. // We can just write the code we'd run right in the for loop. - sk_parallel_for(fRunnables.count(), [&](int i) { + SkTaskGroup().batch(fRunnables.count(), [&](int i) { fRunnables[i]->run(); }); } -- cgit v1.2.3