Revert of SkThreadPool ~~> SkTaskGroup (patchset #4 id:60001 of https://codereview.chromium.org/531653002/)

Reason for revert: Leaks, leaks, leaks. Original issue's description: > SkThreadPool ~~> SkTaskGroup > > SkTaskGroup is like SkThreadPool except the threads stay in > one global pool. Each SkTaskGroup itself is tiny (4 bytes) > and its wait() method applies only to tasks add()ed to that > instance, not the whole thread pool. > > This means we don't need to bring up new thread pools when > tests themselves want to use multithreading (e.g. pathops, > quilt). We just create a new SkTaskGroup and wait for that > to complete. This should be more efficient, and allow us > to expand where we use threads to really latency sensitive > places. E.g. we can probably now use these in nanobench > for CPU .skp rendering. > > Now that all threads are sharing the same pool, I think we > can remove most of the custom mechanism pathops tests use > to control threading. They'll just ride on the global pool > with all other tests now. > > This (temporarily?) removes the GPU multithreading feature > from DM, which we don't use. > > On my desktop, DM runs a little faster (57s -> 55s) in > Debug, and a lot faster in Release (36s -> 24s). The bots > show speedups of similar proportions, cutting more than a > minute off the N4/Release and Win7/Debug runtimes. > > BUG=skia: > > Committed: https://skia.googlesource.com/skia/+/9c7207b5dc71dc5a96a2eb107d401133333d5b6f R=caryclark@google.com, bsalomon@google.com, bungeman@google.com, reed@google.com, mtklein@chromium.org TBR=bsalomon@google.com, bungeman@google.com, caryclark@google.com, mtklein@chromium.org, reed@google.com NOTREECHECKS=true NOTRY=true BUG=skia: Author: mtklein@google.com Review URL: https://codereview.chromium.org/533393002
author: mtklein <mtklein@google.com> 2014-09-03 14:17:48 -0700
committer: Commit bot <commit-bot@chromium.org> 2014-09-03 14:17:48 -0700
commit: 2460bbdfbb1d55ef307c3189c661e65de1a7affb (patch)
tree: 62342a335023875d1482447bea9e29e8a0ba22fb
parent: 9c7207b5dc71dc5a96a2eb107d401133333d5b6f (diff)
42 files changed, 383 insertions, 294 deletions
diff --git a/dm/DM.cpp b/dm/DM.cpp
index 9d111eac30..09f0ce8f26 100644
--- a/dm/DM.cpp
+++ b/dm/DM.cpp
@@ -41,6 +41,7 @@ using skiatest::TestRegistry;
 static const char kGpuAPINameGL[] = "gl";
 static const char kGpuAPINameGLES[] = "gles";
 
+DEFINE_int32(gpuThreads, 1, "Threads for GPU work.");
 DEFINE_string2(expectations, r, "",
                "If a directory, compare generated images against images under this path. "
                "If a file, compare generated images against JSON expectations at this path."
@@ -232,7 +233,7 @@ int dm_main() {
     SkDebugf("%d GMs x %d configs, %d tests, %d pictures\n",
              gms.count(), configs.count(), tests.count(), skps.count());
     DM::Reporter reporter;
-    DM::TaskRunner tasks(FLAGS_threads);
+    DM::TaskRunner tasks(FLAGS_threads, FLAGS_gpuThreads);
     kick_off_tests(tests, &reporter, &tasks);
     kick_off_gms(gms, configs, gpuAPI, *expectations, &reporter, &tasks);
     kick_off_skps(skps, &reporter, &tasks);
diff --git a/dm/DMQuiltTask.cpp b/dm/DMQuiltTask.cpp
index a824de0f15..6961f09671 100644
--- a/dm/DMQuiltTask.cpp
+++ b/dm/DMQuiltTask.cpp
@@ -5,7 +5,7 @@
 #include "SkBBHFactory.h"
 #include "SkCommandLineFlags.h"
 #include "SkPicture.h"
-#include "SkTaskGroup.h"
+#include "SkThreadPool.h"
 
 DEFINE_bool(quilt, true, "If true, draw GM via a picture into a quilt of small tiles and compare.");
 DEFINE_int32(quiltTile, 256, "Dimension of (square) quilt tile.");
@@ -96,11 +96,11 @@ void QuiltTask::draw() {
         canvas.flush();
     } else {
         // Draw tiles in parallel into the same bitmap, simulating aggressive impl-side painting.
-        SkTaskGroup tg;
+        SkThreadPool pool(SkThreadPool::kThreadPerCore);
         for (int y = 0; y < tiles_needed(full.height(), FLAGS_quiltTile); y++) {
             for (int x = 0; x < tiles_needed(full.width(), FLAGS_quiltTile); x++) {
                 // Deletes itself when done.
-                tg.add(new Tile(x, y, *recorded, &full));
+                pool.add(new Tile(x, y, *recorded, &full));
             }
         }
     }
diff --git a/dm/DMTask.cpp b/dm/DMTask.cpp
index 5ceb53b062..05eda4ea20 100644
--- a/dm/DMTask.cpp
+++ b/dm/DMTask.cpp
@@ -38,8 +38,8 @@ void Task::finish() {
     fReporter->printStatus(this->name(), SkTime::GetMSecs() - fStart);
 }
 
-void Task::reallySpawnChild(CpuTask* task) {
-    fTaskRunner->add(task);
+void Task::spawnChildNext(CpuTask* task) {
+    fTaskRunner->addNext(task);
 }
 
 CpuTask::CpuTask(Reporter* reporter, TaskRunner* taskRunner) : Task(reporter, taskRunner) {}
@@ -56,32 +56,32 @@ void CpuTask::run() {
 
 void CpuTask::spawnChild(CpuTask* task) {
     // Run children serially on this (CPU) thread.  This tends to save RAM and is usually no slower.
-    // Calling reallySpawnChild() is nearly equivalent, but it'd pointlessly contend on the
-    // threadpool; reallySpawnChild() is most useful when you want to change threadpools.
+    // Calling spawnChildNext() is nearly equivalent, but it'd pointlessly contend on the
+    // threadpool; spawnChildNext() is most useful when you want to change threadpools.
     task->run();
 }
 
 GpuTask::GpuTask(Reporter* reporter, TaskRunner* taskRunner) : Task(reporter, taskRunner) {}
 
-void GpuTask::run(GrContextFactory* factory) {
+void GpuTask::run(GrContextFactory& factory) {
     if (FLAGS_gpu && !this->shouldSkip()) {
         this->start();
-        if (!FLAGS_dryRun) this->draw(factory);
+        if (!FLAGS_dryRun) this->draw(&factory);
         this->finish();
         if (FLAGS_abandonGpuContext) {
-            factory->abandonContexts();
+            factory.abandonContexts();
         }
         if (FLAGS_resetGpuContext || FLAGS_abandonGpuContext) {
-            factory->destroyContexts();
+            factory.destroyContexts();
         }
     }
     SkDELETE(this);
 }
 
 void GpuTask::spawnChild(CpuTask* task) {
-    // Spawn a new task so it runs on the CPU threadpool instead of the GPU one we're on now.
+    // Really spawn a new task so it runs on the CPU threadpool instead of the GPU one we're on now.
     // It goes on the front of the queue to minimize the time we must hold reference bitmaps in RAM.
-    this->reallySpawnChild(task);
+    this->spawnChildNext(task);
 }
 
 }  // namespace DM
diff --git a/dm/DMTask.h b/dm/DMTask.h
index 3f41b497ba..32bb9486f5 100644
--- a/dm/DMTask.h
+++ b/dm/DMTask.h
@@ -1,10 +1,9 @@
 #ifndef DMTask_DEFINED
 #define DMTask_DEFINED
 
-#include "DMGpuSupport.h"
 #include "DMReporter.h"
+#include "DMGpuSupport.h"
 #include "SkRunnable.h"
-#include "SkTaskGroup.h"
 #include "SkTime.h"
 
 // DM will run() these tasks on one of two threadpools.
@@ -37,7 +36,7 @@ protected:
     void fail(const char* msg = NULL);
     void finish();
 
-    void reallySpawnChild(CpuTask* task);  // For now we don't allow GPU child tasks.
+    void spawnChildNext(CpuTask* task);  // For now we don't allow GPU child tasks.
 
 private:
     Reporter* fReporter;      // Unowned.
@@ -58,12 +57,12 @@ public:
     void spawnChild(CpuTask* task);
 };
 
-class GpuTask : public Task {
+class GpuTask : public Task, public SkTRunnable<GrContextFactory> {
  public:
     GpuTask(Reporter* reporter, TaskRunner* taskRunner);
     virtual ~GpuTask() {}
 
-    void run(GrContextFactory*);
+    void run(GrContextFactory&) SK_OVERRIDE;
     virtual void draw(GrContextFactory*) = 0;
 
     void spawnChild(CpuTask* task);
diff --git a/dm/DMTaskRunner.cpp b/dm/DMTaskRunner.cpp
index 30d07babc8..8a0bc838e1 100644
--- a/dm/DMTaskRunner.cpp
+++ b/dm/DMTaskRunner.cpp
@@ -3,19 +3,19 @@
 
 namespace DM {
 
-TaskRunner::TaskRunner(int cpuThreads) {
-    SkTaskGroup::SetThreadCount(cpuThreads);
-}
+TaskRunner::TaskRunner(int cpuThreads, int gpuThreads) : fCpu(cpuThreads), fGpu(gpuThreads) {}
 
-void TaskRunner::add(CpuTask* task) { fCpuWork.add(task);  }
-void TaskRunner::add(GpuTask* task) { fGpuWork.push(task); }
+void TaskRunner::add(CpuTask* task) { fCpu.add(task); }
+void TaskRunner::addNext(CpuTask* task) { fCpu.addNext(task); }
+void TaskRunner::add(GpuTask* task) { fGpu.add(task); }
 
 void TaskRunner::wait() {
-    GrContextFactory factory;
-    for (int i = 0; i < fGpuWork.count(); i++) {
-        fGpuWork[i]->run(&factory);
-    }
-    fCpuWork.wait();
+    // These wait calls block until each threadpool is done.  We don't allow
+    // spawning new child GPU tasks, so we can wait for that first knowing
+    // we'll never try to add to it later.  Same can't be said of the CPU pool:
+    // both CPU and GPU tasks can spawn off new CPU work, so we wait for that last.
+    fGpu.wait();
+    fCpu.wait();
 }
 
 }  // namespace DM
diff --git a/dm/DMTaskRunner.h b/dm/DMTaskRunner.h
index d147525380..dd1440ed9a 100644
--- a/dm/DMTaskRunner.h
+++ b/dm/DMTaskRunner.h
@@ -2,12 +2,11 @@
 #define DMTaskRunner_DEFINED
 
 #include "DMGpuSupport.h"
-#include "SkTDArray.h"
-#include "SkTaskGroup.h"
+#include "SkThreadPool.h"
 #include "SkTypes.h"
 
 // TaskRunner runs Tasks on one of two threadpools depending on the need for a GrContextFactory.
-// We fix the number of GPU threads to 1, but go nuts with CPU threads.
+// It's typically a good idea to run fewer GPU threads than CPU threads (go nuts with those).
 
 namespace DM {
 
@@ -16,16 +15,16 @@ class GpuTask;
 
 class TaskRunner : SkNoncopyable {
 public:
-    // 0 -> one thread per core
-    explicit TaskRunner(int cpuThreads);
+    explicit TaskRunner(int cpuThreads, int gpuThreads);
 
     void add(CpuTask* task);
+    void addNext(CpuTask* task);
     void add(GpuTask* task);
     void wait();
 
 private:
-    SkTaskGroup fCpuWork;
-    SkTDArray<GpuTask*> fGpuWork;
+    SkTThreadPool<void> fCpu;
+    SkTThreadPool<GrContextFactory> fGpu;
 };
 
 }  // namespace DM
diff --git a/dm/DMTestTask.cpp b/dm/DMTestTask.cpp
index ad0c3fb4f3..9e7f41e7d0 100644
--- a/dm/DMTestTask.cpp
+++ b/dm/DMTestTask.cpp
@@ -3,11 +3,17 @@
 #include "SkCommandLineFlags.h"
 #include "SkCommonFlags.h"
 
-DEFINE_bool2(pathOpsExtended, x, false, "Run extended pathOps tests.");
+// When PathOps threaded tests get going, they're briefly a big consumer of lots of RAM.
+// We disable the internal threading there by default on 32-bit builds.
+static const bool is32Bit = sizeof(void*) == 4;
+
+DEFINE_bool2(pathOpsExtended,     x, false, "Run extended pathOps tests.");
+DEFINE_bool2(pathOpsSingleThread, z, is32Bit, "Disallow pathOps tests from using threads.");
 
 namespace DM {
 
 bool TestReporter::allowExtendedTest() const { return FLAGS_pathOpsExtended; }
+bool TestReporter::allowThreaded()     const { return !FLAGS_pathOpsSingleThread; }
 bool TestReporter::verbose()           const { return FLAGS_veryVerbose; }
 
 static SkString test_name(const char* name) {
diff --git a/dm/DMTestTask.h b/dm/DMTestTask.h
index ceb0e12e96..a65f096c58 100644
--- a/dm/DMTestTask.h
+++ b/dm/DMTestTask.h
@@ -19,6 +19,7 @@ public:
 
 private:
   virtual bool allowExtendedTest() const SK_OVERRIDE;
+  virtual bool allowThreaded()     const SK_OVERRIDE;
   virtual bool verbose()           const SK_OVERRIDE;
 
   virtual void onReportFailed(const SkString& desc) SK_OVERRIDE {
diff --git a/gyp/dm.gypi b/gyp/dm.gypi
index 0d8652fd22..1ecccd82b2 100644
--- a/gyp/dm.gypi
+++ b/gyp/dm.gypi
@@ -47,8 +47,6 @@
     '../gm/gm.cpp',
     '../gm/gm_expectations.cpp',
 
-    '../src/utils/SkTaskGroup.cpp',
-
     '../src/pipe/utils/SamplePipeControllers.cpp',
     '../src/utils/debugger/SkDebugCanvas.cpp',
     '../src/utils/debugger/SkDrawCommand.cpp',
diff --git a/gyp/pathops_skpclip.gyp b/gyp/pathops_skpclip.gyp
index 32a909bd47..a1e51d65f8 100755
--- a/gyp/pathops_skpclip.gyp
+++ b/gyp/pathops_skpclip.gyp
@@ -15,7 +15,7 @@
         '../src/pipe/utils',
         '../src/utils',
       ],
-      'dependencies': [
+      'dependencies': [ 
         'flags.gyp:flags',
         'skia_lib.gyp:skia_lib',
         'tools.gyp:crash_handler',
@@ -24,7 +24,6 @@
       'sources': [
 		'../tests/PathOpsDebug.cpp',
         '../tests/PathOpsSkpClipTest.cpp',
-        '../src/utils/SkTaskGroup.cpp',
       ],
       'conditions': [
         [ 'skia_android_framework == 1', {
diff --git a/gyp/pathops_unittest.gyp b/gyp/pathops_unittest.gyp
index 98e74bd54c..d3152d2cf3 100644
--- a/gyp/pathops_unittest.gyp
+++ b/gyp/pathops_unittest.gyp
@@ -20,7 +20,6 @@
         '../tests/PathOpsDebug.cpp',
         '../tests/PathOpsOpLoopThreadedTest.cpp',
         '../tests/skia_test.cpp',
-        '../src/utils/SkTaskGroup.cpp',
       ],
       'conditions': [
         [ 'skia_android_framework == 1', {
diff --git a/gyp/tools.gyp b/gyp/tools.gyp
index ce397f3cf3..66b84db66f 100644
--- a/gyp/tools.gyp
+++ b/gyp/tools.gyp
@@ -167,7 +167,6 @@
         '../tools/skpdiff/SkImageDiffer.cpp',
         '../tools/skpdiff/SkPMetric.cpp',
         '../tools/skpdiff/skpdiff_util.cpp',
-        '../src/utils/SkTaskGroup.cpp',
       ],
       'include_dirs': [
         '../src/core/', # needed for SkTLList.h
diff --git a/gyp/utils.gypi b/gyp/utils.gypi
index e62d287bb9..9156b847c8 100644
--- a/gyp/utils.gypi
+++ b/gyp/utils.gypi
@@ -10,6 +10,7 @@
         # Classes for a threadpool.
         '<(skia_src_path)/utils/SkCondVar.h',
         '<(skia_src_path)/utils/SkRunnable.h',
+        '<(skia_src_path)/utils/SkThreadPool.h',
         '<(skia_src_path)/utils/SkCondVar.cpp',
 
         '<(skia_include_path)/utils/SkBoundaryPatch.h',
diff --git a/src/utils/SkRunnable.h b/src/utils/SkRunnable.h
index 7a93b60c89..5acf4dbc61 100644
--- a/src/utils/SkRunnable.h
+++ b/src/utils/SkRunnable.h
@@ -8,9 +8,18 @@
 #ifndef SkRunnable_DEFINED
 #define SkRunnable_DEFINED
 
-struct SkRunnable {
-    virtual ~SkRunnable() {};
+template <typename T>
+struct SkTRunnable {
+    virtual ~SkTRunnable() {};
+    virtual void run(T&) = 0;
+};
+
+template <>
+struct SkTRunnable<void> {
+    virtual ~SkTRunnable() {};
     virtual void run() = 0;
 };
 
+typedef SkTRunnable<void> SkRunnable;
+
 #endif
diff --git a/src/utils/SkTaskGroup.cpp b/src/utils/SkTaskGroup.cpp
deleted file mode 100644
index a42c0a43af..0000000000
--- a/src/utils/SkTaskGroup.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-#include "SkTaskGroup.h"
-
-#include "SkCondVar.h"
-#include "SkLazyPtr.h"
-#include "SkTDArray.h"
-#include "SkThread.h"
-#include "SkThreadUtils.h"
-
-#if defined(SK_BUILD_FOR_WIN32)
-    static inline int num_cores() {
-        SYSTEM_INFO sysinfo;
-        GetSystemInfo(&sysinfo);
-        return sysinfo.dwNumberOfProcessors;
-    }
-#else
-    #include <unistd.h>
-    static inline int num_cores() {
-        return (int) sysconf(_SC_NPROCESSORS_ONLN);
-    }
-#endif
-
-namespace {
-
-static int gThreadCount = 0;
-
-class ThreadPool : SkNoncopyable {
-public:
-    static void Add(SkRunnable* task, int32_t* pending) {
-        Global()->add(task, pending);
-    }
-
-    static void Wait(int32_t* pending) {
-        while (sk_acquire_load(pending) > 0) {  // Pairs with sk_atomic_dec here or in Loop.
-            // Lend a hand until our SkTaskGroup of interest is done.
-            ThreadPool* pool = Global();
-            Work work;
-            {
-                AutoLock lock(&pool->fReady);
-                if (pool->fWork.isEmpty()) {
-                    // Someone has picked up all the work (including ours).  How nice of them!
-                    // (They may still be working on it, so we can't assert *pending == 0 here.)
-                    continue;
-                }
-                pool->fWork.pop(&work);
-            }
-            // This Work isn't necessarily part of our SkTaskGroup of interest, but that's fine.
-            // We threads gotta stick together.  We're always making forward progress.
-            work.task->run();
-            sk_atomic_dec(work.pending);  // Release pairs with the sk_acquire_load() just above.
-        }
-    }
-
-private:
-    struct AutoLock {
-        AutoLock(SkCondVar* c) : fC(c) { fC->lock(); }
-        ~AutoLock() { fC->unlock(); }
-    private:
-        SkCondVar* fC;
-    };
-
-    struct Work {
-        SkRunnable* task;  // A task to ->run(),
-        int32_t* pending;  // then sk_atomic_dec(pending) afterwards.
-    };
-
-    static ThreadPool* Create() { return SkNEW(ThreadPool); }
-    static void Destroy(ThreadPool* p) { SkDELETE(p); }
-    static ThreadPool* Global() {
-        SK_DECLARE_STATIC_LAZY_PTR(ThreadPool, global, Create, Destroy);
-        return global.get();
-    }
-
-    ThreadPool() : fDraining(false) {
-        const int threads = gThreadCount ? gThreadCount : num_cores();
-        for (int i = 0; i < threads; i++) {
-            fThreads.push(SkNEW_ARGS(SkThread, (&ThreadPool::Loop, this)));
-            fThreads.top()->start();
-        }
-    }
-
-    ~ThreadPool() {
-        SkASSERT(fWork.isEmpty());  // All SkTaskGroups should be destroyed by now.
-        {
-            AutoLock lock(&fReady);
-            fDraining = true;
-            fReady.broadcast();
-        }
-        for (int i = 0; i < fThreads.count(); i++) {
-            fThreads[i]->join();
-        }
-        SkASSERT(fWork.isEmpty());  // Can't hurt to double check.
-        fThreads.deleteAll();
-    }
-
-    void add(SkRunnable* task, int32_t* pending) {
-        Work work = { task, pending };
-        sk_atomic_inc(pending);  // No barrier needed.
-        {
-            AutoLock lock(&fReady);
-            fWork.push(work);
-            fReady.signal();
-        }
-    }
-
-    static void Loop(void* arg) {
-        ThreadPool* pool = (ThreadPool*)arg;
-        Work work;
-        while (true) {
-            {
-                AutoLock lock(&pool->fReady);
-                while (pool->fWork.isEmpty()) {
-                    if (pool->fDraining) {
-                        return;
-                    }
-                    pool->fReady.wait();
-                }
-                pool->fWork.pop(&work);
-            }
-            work.task->run();
-            sk_atomic_dec(work.pending);  // Release pairs with sk_acquire_load() in Wait().
-        }
-    }
-
-    SkTDArray<Work>      fWork;
-    SkTDArray<SkThread*> fThreads;
-    SkCondVar            fReady;
-    bool                 fDraining;
-};
-
-}  // namespace
-
-void SkTaskGroup::SetThreadCount(int n) { gThreadCount = n; }
-
-SkTaskGroup::SkTaskGroup() : fPending(0) {}
-
-void SkTaskGroup::add(SkRunnable* task) { ThreadPool::Add(task, &fPending); }
-void SkTaskGroup::wait()                { ThreadPool::Wait(&fPending); }
diff --git a/src/utils/SkTaskGroup.h b/src/utils/SkTaskGroup.h
deleted file mode 100644
index af4d47aa97..0000000000
--- a/src/utils/SkTaskGroup.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright 2014 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifndef SkTaskGroup_DEFINED
-#define SkTaskGroup_DEFINED
-
-#include "SkTypes.h"
-#include "SkRunnable.h"
-
-class SkTaskGroup : SkNoncopyable {
-public:
-    // Call before creating any SkTaskGroup to set the number of threads all SkTaskGroups share.
-    // If not called, we default to the number of system-reported cores.
-    static void SetThreadCount(int);
-
-    SkTaskGroup();
-    ~SkTaskGroup() { this->wait(); }
-
-    // Add a task to this SkTaskGroup.  It will likely run() on another thread.
-    void add(SkRunnable*);
-
-    // Block until all Tasks previously add()ed to this SkTaskGroup have run().
-    // You may safely reuse this SkTaskGroup after wait() returns.
-    void wait();
-
-private:
-    /*atomic*/ int32_t fPending;
-};
-
-#endif//SkTaskGroup_DEFINED
diff --git a/src/utils/SkThreadPool.h b/src/utils/SkThreadPool.h
new file mode 100644
index 0000000000..c99c5c4188
--- /dev/null
+++ b/src/utils/SkThreadPool.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkThreadPool_DEFINED
+#define SkThreadPool_DEFINED
+
+#include "SkCondVar.h"
+#include "SkRunnable.h"
+#include "SkTDArray.h"
+#include "SkTInternalLList.h"
+#include "SkThreadUtils.h"
+#include "SkTypes.h"
+
+#if defined(SK_BUILD_FOR_UNIX) || defined(SK_BUILD_FOR_MAC) || defined(SK_BUILD_FOR_ANDROID)
+#    include <unistd.h>
+#endif
+
+// Returns the number of cores on this machine.
+static inline int num_cores() {
+#if defined(SK_BUILD_FOR_WIN32)
+    SYSTEM_INFO sysinfo;
+    GetSystemInfo(&sysinfo);
+    return sysinfo.dwNumberOfProcessors;
+#elif defined(SK_BUILD_FOR_UNIX) || defined(SK_BUILD_FOR_MAC) || defined(SK_BUILD_FOR_ANDROID)
+    return (int) sysconf(_SC_NPROCESSORS_ONLN);
+#else
+    return 1;
+#endif
+}
+
+template <typename T>
+class SkTThreadPool {
+public:
+    /**
+     * Create a threadpool with count threads, or one thread per core if kThreadPerCore.
+     */
+    static const int kThreadPerCore = -1;
+    explicit SkTThreadPool(int count);
+    ~SkTThreadPool();
+
+    /**
+     * Queues up an SkRunnable to run when a thread is available, or synchronously if count is 0.
+     * Does not take ownership. NULL is a safe no-op.  If T is not void, the runnable will be passed
+     * a reference to a T on the thread's local stack.
+     */
+    void add(SkTRunnable<T>*);
+
+    /**
+     * Same as add, but adds the runnable as the very next to run rather than enqueueing it.
+     */
+    void addNext(SkTRunnable<T>*);
+
+    /**
+     * Block until all added SkRunnables have completed.  Once called, calling add() is undefined.
+     */
+    void wait();
+
+ private:
+    struct LinkedRunnable {
+        SkTRunnable<T>* fRunnable;  // Unowned.
+        SK_DECLARE_INTERNAL_LLIST_INTERFACE(LinkedRunnable);
+    };
+
+    enum State {
+        kRunning_State,  // Normal case.  We've been constructed and no one has called wait().
+        kWaiting_State,  // wait has been called, but there still might be work to do or being done.
+        kHalting_State,  // There's no work to do and no thread is busy.  All threads can shut down.
+    };
+
+    void addSomewhere(SkTRunnable<T>* r,
+                      void (SkTInternalLList<LinkedRunnable>::*)(LinkedRunnable*));
+
+    SkTInternalLList<LinkedRunnable> fQueue;
+    SkCondVar                        fReady;
+    SkTDArray<SkThread*>             fThreads;
+    State                            fState;
+    int                              fBusyThreads;
+
+    static void Loop(void*);  // Static because we pass in this.
+};
+
+template <typename T>
+SkTThreadPool<T>::SkTThreadPool(int count) : fState(kRunning_State), fBusyThreads(0) {
+    if (count < 0) {
+        count = num_cores();
+    }
+    // Create count threads, all running SkTThreadPool::Loop.
+    for (int i = 0; i < count; i++) {
+        SkThread* thread = SkNEW_ARGS(SkThread, (&SkTThreadPool::Loop, this));
+        *fThreads.append() = thread;
+        thread->start();
+    }
+}
+
+template <typename T>
+SkTThreadPool<T>::~SkTThreadPool() {
+    if (kRunning_State == fState) {
+        this->wait();
+    }
+}
+
+namespace SkThreadPoolPrivate {
+
+template <typename T>
+struct ThreadLocal {
+    void run(SkTRunnable<T>* r) { r->run(data); }
+    T data;
+};
+
+template <>
+struct ThreadLocal<void> {
+    void run(SkTRunnable<void>* r) { r->run(); }
+};
+
+}  // namespace SkThreadPoolPrivate
+
+template <typename T>
+void SkTThreadPool<T>::addSomewhere(SkTRunnable<T>* r,
+                                    void (SkTInternalLList<LinkedRunnable>::* f)(LinkedRunnable*)) {
+    if (r == NULL) {
+        return;
+    }
+
+    if (fThreads.isEmpty()) {
+        SkThreadPoolPrivate::ThreadLocal<T> threadLocal;
+        threadLocal.run(r);
+        return;
+    }
+
+    LinkedRunnable* linkedRunnable = SkNEW(LinkedRunnable);
+    linkedRunnable->fRunnable = r;
+    fReady.lock();
+    SkASSERT(fState != kHalting_State);  // Shouldn't be able to add work when we're halting.
+    (fQueue.*f)(linkedRunnable);
+    fReady.signal();
+    fReady.unlock();
+}
+
+template <typename T>
+void SkTThreadPool<T>::add(SkTRunnable<T>* r) {
+    this->addSomewhere(r, &SkTInternalLList<LinkedRunnable>::addToTail);
+}
+
+template <typename T>
+void SkTThreadPool<T>::addNext(SkTRunnable<T>* r) {
+    this->addSomewhere(r, &SkTInternalLList<LinkedRunnable>::addToHead);
+}
+
+
+template <typename T>
+void SkTThreadPool<T>::wait() {
+    fReady.lock();
+    fState = kWaiting_State;
+    fReady.broadcast();
+    fReady.unlock();
+
+    // Wait for all threads to stop.
+    for (int i = 0; i < fThreads.count(); i++) {
+        fThreads[i]->join();
+        SkDELETE(fThreads[i]);
+    }
+    SkASSERT(fQueue.isEmpty());
+}
+
+template <typename T>
+/*static*/ void SkTThreadPool<T>::Loop(void* arg) {
+    // The SkTThreadPool passes itself as arg to each thread as they're created.
+    SkTThreadPool<T>* pool = static_cast<SkTThreadPool<T>*>(arg);
+    SkThreadPoolPrivate::ThreadLocal<T> threadLocal;
+
+    while (true) {
+        // We have to be holding the lock to read the queue and to call wait.
+        pool->fReady.lock();
+        while(pool->fQueue.isEmpty()) {
+            // Does the client want to stop and are all the threads ready to stop?
+            // If so, we move into the halting state, and whack all the threads so they notice.
+            if (kWaiting_State == pool->fState && pool->fBusyThreads == 0) {
+                pool->fState = kHalting_State;
+                pool->fReady.broadcast();
+            }
+            // Any time we find ourselves in the halting state, it's quitting time.
+            if (kHalting_State == pool->fState) {
+                pool->fReady.unlock();
+                return;
+            }
+            // wait yields the lock while waiting, but will have it again when awoken.
+            pool->fReady.wait();
+        }
+        // We've got the lock back here, no matter if we ran wait or not.
+
+        // The queue is not empty, so we have something to run.  Claim it.
+        LinkedRunnable* r = pool->fQueue.head();
+
+        pool->fQueue.remove(r);
+
+        // Having claimed our SkRunnable, we now give up the lock while we run it.
+        // Otherwise, we'd only ever do work on one thread at a time, which rather
+        // defeats the point of this code.
+        pool->fBusyThreads++;
+        pool->fReady.unlock();
+
+        // OK, now really do the work.
+        threadLocal.run(r->fRunnable);
+        SkDELETE(r);
+
+        // Let everyone know we're not busy.
+        pool->fReady.lock();
+        pool->fBusyThreads--;
+        pool->fReady.unlock();
+    }
+
+    SkASSERT(false); // Unreachable.  The only exit happens when pool->fState is kHalting_State.
+}
+
+typedef SkTThreadPool<void> SkThreadPool;
+
+#endif
diff --git a/tests/OnceTest.cpp b/tests/OnceTest.cpp
index 192abaaee3..389d257b73 100644
--- a/tests/OnceTest.cpp
+++ b/tests/OnceTest.cpp
@@ -6,7 +6,7 @@
  */
 
 #include "SkOnce.h"
-#include "SkTaskGroup.h"
+#include "SkThreadPool.h"
 #include "Test.h"
 
 static void add_five(int* x) {
@@ -42,7 +42,7 @@ public:
 };
 
 DEF_TEST(SkOnce_Multithreaded, r) {
-    const int kTasks = 16;
+    const int kTasks = 16, kThreads = 4;
 
     // Make a bunch of tasks that will race to be the first to add six to x.
     Racer racers[kTasks];
@@ -54,11 +54,11 @@ DEF_TEST(SkOnce_Multithreaded, r) {
     }
 
     // Let them race.
-    SkTaskGroup tg;
+    SkThreadPool pool(kThreads);
     for (int i = 0; i < kTasks; i++) {
-        tg.add(&racers[i]);
+        pool.add(&racers[i]);
     }
-    tg.wait();
+    pool.wait();
 
     // Only one should have done the +=.
     REPORTER_ASSERT(r, 6 == x);
diff --git a/tests/PathOpsExtendedTest.cpp b/tests/PathOpsExtendedTest.cpp
index f90b9fec69..05d00045b9 100644
--- a/tests/PathOpsExtendedTest.cpp
+++ b/tests/PathOpsExtendedTest.cpp
@@ -14,8 +14,8 @@
 #include "SkPaint.h"
 #include "SkRTConf.h"
 #include "SkStream.h"
-#include "SkTaskGroup.h"
 #include "SkThread.h"
+#include "SkThreadPool.h"
 
 #ifdef SK_BUILD_FOR_MAC
 #include <sys/sysctl.h>
@@ -542,7 +542,7 @@ bool testThreadedPathOp(skiatest::Reporter* reporter, const SkPath& a, const SkP
 
 SK_DECLARE_STATIC_MUTEX(gMutex);
 
-void initializeTests(skiatest::Reporter* reporter, const char* test) {
+int initializeTests(skiatest::Reporter* reporter, const char* test) {
 #if 0  // doesn't work yet
     SK_CONF_SET("images.jpeg.suppressDecoderWarnings", true);
     SK_CONF_SET("images.png.suppressDecoderWarnings", true);
@@ -566,6 +566,7 @@ void initializeTests(skiatest::Reporter* reporter, const char* test) {
             }
         }
     }
+    return reporter->allowThreaded() ? SkThreadPool::kThreadPerCore : 1;
 }
 
 void outputProgress(char* ramStr, const char* pathStr, SkPath::FillType pathFillType) {
diff --git a/tests/PathOpsExtendedTest.h b/tests/PathOpsExtendedTest.h
index a854410139..5f3413c572 100644
--- a/tests/PathOpsExtendedTest.h
+++ b/tests/PathOpsExtendedTest.h
@@ -36,7 +36,7 @@ extern bool testSimplify(SkPath& path, bool useXor, SkPath& out, PathOpsThreadSt
                          const char* pathStr);
 extern bool testSimplify(skiatest::Reporter* reporter, const SkPath& path, const char* filename);
 
-void initializeTests(skiatest::Reporter* reporter, const char* testName);
+int initializeTests(skiatest::Reporter* reporter, const char* testName);
 void outputProgress(char* ramStr, const char* pathStr, SkPath::FillType );
 void outputProgress(char* ramStr, const char* pathStr, SkPathOp op);
 
diff --git a/tests/PathOpsOpCubicThreadedTest.cpp b/tests/PathOpsOpCubicThreadedTest.cpp
index 751ccc5f1b..889ade0487 100644
--- a/tests/PathOpsOpCubicThreadedTest.cpp
+++ b/tests/PathOpsOpCubicThreadedTest.cpp
@@ -67,8 +67,8 @@ static void testOpCubicsMain(PathOpsThreadState* data) {
 }
 
 DEF_TEST(PathOpsOpCubicsThreaded, reporter) {
-    initializeTests(reporter, "cubicOp");
-    PathOpsThreadedTestRunner testRunner(reporter);
+    int threadCount = initializeTests(reporter, "cubicOp");
+    PathOpsThreadedTestRunner testRunner(reporter, threadCount);
     for (int a = 0; a < 6; ++a) {  // outermost
         for (int b = a + 1; b < 7; ++b) {
             for (int c = 0 ; c < 6; ++c) {
diff --git a/tests/PathOpsOpLoopThreadedTest.cpp b/tests/PathOpsOpLoopThreadedTest.cpp
index 3f316d1fe9..71efff3edc 100755
--- a/tests/PathOpsOpLoopThreadedTest.cpp
+++ b/tests/PathOpsOpLoopThreadedTest.cpp
@@ -62,8 +62,8 @@ static void testOpLoopsMain(PathOpsThreadState* data) {
 }
 
 DEF_TEST(PathOpsOpLoopsThreaded, reporter) {
-    initializeTests(reporter, "cubicOp");
-    PathOpsThreadedTestRunner testRunner(reporter);
+    int threadCount = initializeTests(reporter, "cubicOp");
+    PathOpsThreadedTestRunner testRunner(reporter, threadCount);
     for (int a = 0; a < 6; ++a) {  // outermost
         for (int b = a + 1; b < 7; ++b) {
             for (int c = 0 ; c < 6; ++c) {
@@ -81,7 +81,7 @@ finish:
 }
 
 DEF_TEST(PathOpsOpLoops, reporter) {
-    initializeTests(reporter, "cubicOp");
+    (void) initializeTests(reporter, "cubicOp");
     PathOpsThreadState state;
     state.fReporter = reporter;
     SkBitmap bitmap;
diff --git a/tests/PathOpsOpRectThreadedTest.cpp b/tests/PathOpsOpRectThreadedTest.cpp
index 1b6e4e86b9..3d07d74bb0 100644
--- a/tests/PathOpsOpRectThreadedTest.cpp
+++ b/tests/PathOpsOpRectThreadedTest.cpp
@@ -74,8 +74,8 @@ static void testPathOpsRectsMain(PathOpsThreadState* data)
 }
 
 DEF_TEST(PathOpsRectsThreaded, reporter) {
-    initializeTests(reporter, "testOp");
-    PathOpsThreadedTestRunner testRunner(reporter);
+    int threadCount = initializeTests(reporter, "testOp");
+    PathOpsThreadedTestRunner testRunner(reporter, threadCount);
     for (int a = 0; a < 6; ++a) {  // outermost
         for (int b = a + 1; b < 7; ++b) {
             for (int c = 0 ; c < 6; ++c) {
diff --git a/tests/PathOpsQuadLineIntersectionThreadedTest.cpp b/tests/PathOpsQuadLineIntersectionThreadedTest.cpp
index 7e33b7b374..fd7581fa58 100644
--- a/tests/PathOpsQuadLineIntersectionThreadedTest.cpp
+++ b/tests/PathOpsQuadLineIntersectionThreadedTest.cpp
@@ -111,8 +111,8 @@ static void testQuadLineIntersectMain(PathOpsThreadState* data)
 }
 
 DEF_TEST(PathOpsQuadLineIntersectionThreaded, reporter) {
-    initializeTests(reporter, "testQuadLineIntersect");
-    PathOpsThreadedTestRunner testRunner(reporter);
+    int threadCount = initializeTests(reporter, "testQuadLineIntersect");
+    PathOpsThreadedTestRunner testRunner(reporter, threadCount);
     for (int a = 0; a < 16; ++a) {
         for (int b = 0 ; b < 16; ++b) {
             for (int c = 0 ; c < 16; ++c) {
diff --git a/tests/PathOpsSimplifyDegenerateThreadedTest.cpp b/tests/PathOpsSimplifyDegenerateThreadedTest.cpp
index 8e8c58bf44..5cd3c35d8a 100755
--- a/tests/PathOpsSimplifyDegenerateThreadedTest.cpp
+++ b/tests/PathOpsSimplifyDegenerateThreadedTest.cpp
@@ -68,8 +68,8 @@ static void testSimplifyDegeneratesMain(PathOpsThreadState* data) {
 }
 
 DEF_TEST(PathOpsSimplifyDegeneratesThreaded, reporter) {
-    initializeTests(reporter, "testDegenerates");
-    PathOpsThreadedTestRunner testRunner(reporter);
+    int threadCount = initializeTests(reporter, "testDegenerates");
+    PathOpsThreadedTestRunner testRunner(reporter, threadCount);
     for (int a = 0; a < 16; ++a) {
         int ax = a & 0x03;
         int ay = a >> 2;
diff --git a/tests/PathOpsSimplifyQuadThreadedTest.cpp b/tests/PathOpsSimplifyQuadThreadedTest.cpp
index 3c92cca217..dbbec3e3e6 100644
--- a/tests/PathOpsSimplifyQuadThreadedTest.cpp
+++ b/tests/PathOpsSimplifyQuadThreadedTest.cpp
@@ -74,8 +74,8 @@ static void testSimplifyQuadsMain(PathOpsThreadState* data)
 }
 
 DEF_TEST(PathOpsSimplifyQuadsThreaded, reporter) {
-    initializeTests(reporter, "testQuads");
-    PathOpsThreadedTestRunner testRunner(reporter);
+    int threadCount = initializeTests(reporter, "testQuads");
+    PathOpsThreadedTestRunner testRunner(reporter, threadCount);
     int a = 0;
     for (; a < 16; ++a) {
         for (int b = a ; b < 16; ++b) {
diff --git a/tests/PathOpsSimplifyQuadralateralsThreadedTest.cpp b/tests/PathOpsSimplifyQuadralateralsThreadedTest.cpp
index f8e9a6e3dc..afa9200389 100755
--- a/tests/PathOpsSimplifyQuadralateralsThreadedTest.cpp
+++ b/tests/PathOpsSimplifyQuadralateralsThreadedTest.cpp
@@ -76,8 +76,8 @@ static void testSimplifyQuadralateralsMain(PathOpsThreadState* data)
 }
 
 DEF_TEST(PathOpsSimplifyQuadralateralsThreaded, reporter) {
-    initializeTests(reporter, "testQuadralaterals");
-    PathOpsThreadedTestRunner testRunner(reporter);
+    int threadCount = initializeTests(reporter, "testQuadralaterals");
+    PathOpsThreadedTestRunner testRunner(reporter, threadCount);
     for (int a = 0; a < 16; ++a) {
         for (int b = a ; b < 16; ++b) {
             for (int c = b ; c < 16; ++c) {
diff --git a/tests/PathOpsSimplifyRectThreadedTest.cpp b/tests/PathOpsSimplifyRectThreadedTest.cpp
index 52a78ece84..9e6a5eaf03 100644
--- a/tests/PathOpsSimplifyRectThreadedTest.cpp
+++ b/tests/PathOpsSimplifyRectThreadedTest.cpp
@@ -187,8 +187,8 @@ static void testSimplify4x4RectsMain(PathOpsThreadState* data)
 }
 
 DEF_TEST(PathOpsSimplifyRectsThreaded, reporter) {
-    initializeTests(reporter, "testLine");
-    PathOpsThreadedTestRunner testRunner(reporter);
+    int threadCount = initializeTests(reporter, "testLine");
+    PathOpsThreadedTestRunner testRunner(reporter, threadCount);
     for (int a = 0; a < 8; ++a) {  // outermost
         for (int b = a ; b < 8; ++b) {
             for (int c = b ; c < 8; ++c) {
diff --git a/tests/PathOpsSimplifyTrianglesThreadedTest.cpp b/tests/PathOpsSimplifyTrianglesThreadedTest.cpp
index ee0ca2bcaa..b5d6508846 100755
--- a/tests/PathOpsSimplifyTrianglesThreadedTest.cpp
+++ b/tests/PathOpsSimplifyTrianglesThreadedTest.cpp
@@ -73,8 +73,8 @@ static void testSimplifyTrianglesMain(PathOpsThreadState* data) {
 }
 
 DEF_TEST(PathOpsSimplifyTrianglesThreaded, reporter) {
-    initializeTests(reporter, "testTriangles");
-    PathOpsThreadedTestRunner testRunner(reporter);
+    int threadCount = initializeTests(reporter, "testTriangles");
+    PathOpsThreadedTestRunner testRunner(reporter, threadCount);
     for (int a = 0; a < 15; ++a) {
         int ax = a & 0x03;
         int ay = a >> 2;
diff --git a/tests/PathOpsSkpClipTest.cpp b/tests/PathOpsSkpClipTest.cpp
index 0769141685..cdc3c1fcd9 100755
--- a/tests/PathOpsSkpClipTest.cpp
+++ b/tests/PathOpsSkpClipTest.cpp
@@ -20,8 +20,8 @@
 #include "SkString.h"
 #include "SkTArray.h"
 #include "SkTDArray.h"
-#include "SkTaskGroup.h"
 #include "SkTemplates.h"
+#include "SkThreadPool.h"
 #include "SkTime.h"
 
 __SK_FORCE_IMAGE_DECODER_LINKING;
@@ -90,14 +90,14 @@ static SkString get_in_path(int dirNo, const char* filename) {
     }
     if (filename) {
         path.appendf("%s%s", PATH_SLASH, filename);
-        if (!sk_exists(path.c_str())) {
+        if (!sk_exists(path.c_str())) {        
             SkDebugf("could not read %s\n", path.c_str());
             return SkString();
         }
     }
     return path;
 }
-
+    
 static void make_recursive_dir(const SkString& path) {
     if (sk_exists(path.c_str())) {
         return;
@@ -129,7 +129,7 @@ static SkString get_out_path(int dirNo, const char* dirName) {
     make_recursive_dir(path);
     return path;
 }
-
+  
 static SkString get_sum_path(const char* dirName) {
     SkString path;
     SkASSERT(dirName);
@@ -166,12 +166,12 @@ struct TestResult {
         fTestStep = kCompareBits;
         fScale = 1;
     }
-
+    
     void init(int dirNo, const SkString& filename) {
         fDirNo = dirNo;
         strcpy(fFilename, filename.c_str());
         fTestStep = kCompareBits;
-        fScale = 1;
+        fScale = 1;       
     }
 
     SkString status() {
@@ -204,7 +204,7 @@ struct TestResult {
     }
 
     void testOne();
-
+    
     char fFilename[kMaxLength];
     TestStep fTestStep;
     int fDirNo;
@@ -245,8 +245,13 @@ struct TestState {
 };
 
 struct TestRunner {
+    TestRunner(int threadCount)
+        : fNumThreads(threadCount) {
+    }
+
     ~TestRunner();
     void render();
+    int fNumThreads;
     SkTDArray<class TestRunnable*> fRunnables;
 };
 
@@ -295,9 +300,9 @@ TestRunner::~TestRunner() {
 }
 
 void TestRunner::render() {
-    SkTaskGroup tg;
+    SkThreadPool pool(fNumThreads);
     for (int index = 0; index < fRunnables.count(); ++ index) {
-        tg.add(fRunnables[index]);
+        pool.add(fRunnables[index]);
     }
 }
 
@@ -526,12 +531,18 @@ DEFINE_string2(dir, d, NULL, "range of directories (e.g., 1-100)");
 DEFINE_string2(skp, s, NULL, "skp to test");
 DEFINE_bool2(single, z, false, "run tests on a single thread internally.");
 DEFINE_int32(testIndex, 0, "override local test index (PathOpsSkpClipOneOff only).");
+DEFINE_int32(threads, SkThreadPool::kThreadPerCore,
+        "Run threadsafe tests on a threadpool with this many threads.");
 DEFINE_bool2(verbose, v, false, "enable verbose output.");
 
 static bool verbose() {
     return FLAGS_verbose;
 }
 
+static int getThreadCount() {
+    return FLAGS_single ? 1 : FLAGS_threads;
+}
+
 class Dirs {
 public:
     Dirs() {
@@ -605,7 +616,7 @@ public:
         }
         return NULL;
     }
-
+    
     void set(const SkCommandLineFlags::StringArray& names) {
         fNames = &names;
     }
@@ -615,7 +626,7 @@ private:
     const SkCommandLineFlags::StringArray* fNames;
 } gNames;
 
-static bool buildTestDir(int dirNo, int firstDirNo,
+static bool buildTestDir(int dirNo, int firstDirNo, 
         SkTDArray<TestResult>* tests, SkTDArray<SortByName*>* sorted) {
     SkString dirName = get_out_path(dirNo, outStatusDir);
     if (!dirName.size()) {
@@ -781,7 +792,8 @@ static void encodeFound(TestState& state) {
             }
         }
     }
-    TestRunner testRunner;
+    int threadCount = getThreadCount();
+    TestRunner testRunner(threadCount);
     for (int index = 0; index < state.fPixelWorst.count(); ++index) {
         const TestResult& result = state.fPixelWorst[index];
         SkString filename(result.fFilename);
@@ -853,7 +865,8 @@ static void testSkpClipMain(TestState* data) {
 DEF_TEST(PathOpsSkpClipThreaded) {
     gDirs.setDefault();
     initTest();
-    TestRunner testRunner;
+    int threadCount = getThreadCount();
+    TestRunner testRunner(threadCount);
     int dirNo;
     gDirs.reset();
     while ((dirNo = gDirs.next()) > 0) {
@@ -876,7 +889,7 @@ DEF_TEST(PathOpsSkpClipThreaded) {
     }
     encodeFound(state);
 }
-
+ 
 static bool buildTests(SkTDArray<TestResult>* tests, SkTDArray<SortByName*>* sorted) {
     int firstDirNo = gDirs.first();
     int dirNo;
@@ -899,7 +912,8 @@ DEF_TEST(PathOpsSkpClipUberThreaded) {
     if (!buildTests(tests.get(), sorted.get())) {
         return;
     }
-    TestRunner testRunner;
+    int threadCount = getThreadCount();
+    TestRunner testRunner(threadCount);
     int dirNo;
     gDirs.reset();
     while ((dirNo = gDirs.next()) > 0) {
diff --git a/tests/PathOpsThreadedCommon.cpp b/tests/PathOpsThreadedCommon.cpp
index 0adde915e0..ac4cd6ba62 100644
--- a/tests/PathOpsThreadedCommon.cpp
+++ b/tests/PathOpsThreadedCommon.cpp
@@ -7,7 +7,7 @@
 
 #include "PathOpsExtendedTest.h"
 #include "PathOpsThreadedCommon.h"
-#include "SkTaskGroup.h"
+#include "SkThreadPool.h"
 
 PathOpsThreadedTestRunner::~PathOpsThreadedTestRunner() {
     for (int index = 0; index < fRunnables.count(); index++) {
@@ -16,8 +16,8 @@ PathOpsThreadedTestRunner::~PathOpsThreadedTestRunner() {
 }
 
 void PathOpsThreadedTestRunner::render() {
-    SkTaskGroup tg;
+    SkThreadPool pool(fNumThreads);
     for (int index = 0; index < fRunnables.count(); ++ index) {
-        tg.add(fRunnables[index]);
+        pool.add(fRunnables[index]);
     }
 }
diff --git a/tests/PathOpsThreadedCommon.h b/tests/PathOpsThreadedCommon.h
index 124921e389..a638cd2fdf 100644
--- a/tests/PathOpsThreadedCommon.h
+++ b/tests/PathOpsThreadedCommon.h
@@ -33,13 +33,17 @@ struct PathOpsThreadState {
 
 class PathOpsThreadedTestRunner {
 public:
-    PathOpsThreadedTestRunner(skiatest::Reporter* reporter) : fReporter(reporter) {}
+    PathOpsThreadedTestRunner(skiatest::Reporter* reporter, int threadCount)
+        : fNumThreads(threadCount)
+        , fReporter(reporter) {
+    }
 
     ~PathOpsThreadedTestRunner();
 
     void render();
 
 public:
+    int fNumThreads;
     SkTDArray<PathOpsThreadedRunnable*> fRunnables;
     skiatest::Reporter* fReporter;
 };
diff --git a/tests/PathOpsTightBoundsTest.cpp b/tests/PathOpsTightBoundsTest.cpp
index cea37520b1..09f962296f 100644
--- a/tests/PathOpsTightBoundsTest.cpp
+++ b/tests/PathOpsTightBoundsTest.cpp
@@ -35,8 +35,8 @@ static void testTightBoundsLines(PathOpsThreadState* data) {
 }
 
 DEF_TEST(PathOpsTightBoundsLines, reporter) {
-    initializeTests(reporter, "tightBoundsLines");
-    PathOpsThreadedTestRunner testRunner(reporter);
+    int threadCount = initializeTests(reporter, "tightBoundsLines");
+    PathOpsThreadedTestRunner testRunner(reporter, threadCount);
     int outerCount = reporter->allowExtendedTest() ? 100 : 1;
     for (int index = 0; index < outerCount; ++index) {
         for (int idx2 = 0; idx2 < 10; ++idx2) {
@@ -110,8 +110,8 @@ static void testTightBoundsQuads(PathOpsThreadState* data) {
 }
 
 DEF_TEST(PathOpsTightBoundsQuads, reporter) {
-    initializeTests(reporter, "tightBoundsQuads");
-    PathOpsThreadedTestRunner testRunner(reporter);
+    int threadCount = initializeTests(reporter, "tightBoundsQuads");
+    PathOpsThreadedTestRunner testRunner(reporter, threadCount);
     int outerCount = reporter->allowExtendedTest() ? 100 : 1;
     for (int index = 0; index < outerCount; ++index) {
         for (int idx2 = 0; idx2 < 10; ++idx2) {
diff --git a/tests/SkpSkGrTest.cpp b/tests/SkpSkGrTest.cpp
index c882654650..c1883a9890 100644
--- a/tests/SkpSkGrTest.cpp
+++ b/tests/SkpSkGrTest.cpp
@@ -23,7 +23,7 @@
 #include "SkString.h"
 #include "SkTArray.h"
 #include "SkTDArray.h"
-#include "SkTaskGroup.h"
+#include "SkThreadPool.h"
 #include "SkTime.h"
 #include "Test.h"
 
@@ -125,12 +125,14 @@ struct SkpSkGrThreadState {
 };
 
 struct SkpSkGrThreadedTestRunner {
-    SkpSkGrThreadedTestRunner(skiatest::Reporter* reporter)
-        : fReporter(reporter) {
+    SkpSkGrThreadedTestRunner(skiatest::Reporter* reporter, int threadCount)
+        : fNumThreads(threadCount)
+        , fReporter(reporter) {
     }
 
     ~SkpSkGrThreadedTestRunner();
     void render();
+    int fNumThreads;
     SkTDArray<SkpSkGrThreadedRunnable*> fRunnables;
     skiatest::Reporter* fReporter;
 };
@@ -162,9 +164,9 @@ SkpSkGrThreadedTestRunner::~SkpSkGrThreadedTestRunner() {
 }
 
 void SkpSkGrThreadedTestRunner::render() {
-    SkTaskGroup tg;
+    SkThreadPool pool(fNumThreads);
     for (int index = 0; index < fRunnables.count(); ++ index) {
-        tg.add(fRunnables[index]);
+        pool.add(fRunnables[index]);
     }
 }
 
@@ -674,7 +676,8 @@ DEF_TEST(SkpSkGrThreaded, reporter) {
     if (!initTest()) {
         return;
     }
-    SkpSkGrThreadedTestRunner testRunner(reporter);
+    int threadCount = reporter->allowThreaded() ? 3 : 1;
+    SkpSkGrThreadedTestRunner testRunner(reporter, threadCount);
     for (int dirIndex = 1; dirIndex <= 100; ++dirIndex) {
         SkString pictDir = make_in_dir_name(dirIndex);
         if (pictDir.size() == 0) {
diff --git a/tests/Test.cpp b/tests/Test.cpp
index d0147e1e4a..20afd45561 100644
--- a/tests/Test.cpp
+++ b/tests/Test.cpp
@@ -75,6 +75,10 @@ protected:
         return fReporter->allowExtendedTest();
     }
 
+    virtual bool allowThreaded() const SK_OVERRIDE {
+        return fReporter->allowThreaded();
+    }
+
     virtual void bumpTestCount() SK_OVERRIDE {
         fReporter->bumpTestCount();
     }
diff --git a/tests/Test.h b/tests/Test.h
index 6c85b32bce..72b0bee126 100644
--- a/tests/Test.h
+++ b/tests/Test.h
@@ -32,6 +32,7 @@ namespace skiatest {
         void endTest(Test*);
 
         virtual bool allowExtendedTest() const { return false; }
+        virtual bool allowThreaded() const { return false; }
         virtual bool verbose() const { return false; }
         virtual void bumpTestCount() { sk_atomic_inc(&fTestCount); }
 
diff --git a/tests/skia_test.cpp b/tests/skia_test.cpp
index a73b6bae53..97ac29b0fd 100644
--- a/tests/skia_test.cpp
+++ b/tests/skia_test.cpp
@@ -12,8 +12,8 @@
 #include "SkGraphics.h"
 #include "SkOSFile.h"
 #include "SkTArray.h"
-#include "SkTaskGroup.h"
 #include "SkTemplates.h"
+#include "SkThreadPool.h"
 #include "SkTime.h"
 #include "Test.h"
 
@@ -54,6 +54,7 @@ public:
     explicit DebugfReporter(int total) : fDone(0), fTotal(total) {}
 
     virtual bool allowExtendedTest() const SK_OVERRIDE { return FLAGS_extendedTest; }
+    virtual bool allowThreaded()     const SK_OVERRIDE { return !FLAGS_single; }
     virtual bool verbose()           const SK_OVERRIDE { return FLAGS_veryVerbose; }
 
 protected:
@@ -174,8 +175,7 @@ int test_main() {
     int32_t failCount = 0;
     int skipCount = 0;
 
-    SkTaskGroup::SetThreadCount(FLAGS_threads);
-    SkTaskGroup cpuTests;
+    SkThreadPool threadpool(FLAGS_threads);
     SkTArray<Test*> gpuTests;  // Always passes ownership to an SkTestRunnable
 
     DebugfReporter reporter(toRun);
@@ -186,7 +186,7 @@ int test_main() {
         } else if (test->isGPUTest()) {
             gpuTests.push_back() = test.detach();
         } else {
-            cpuTests.add(SkNEW_ARGS(SkTestRunnable, (test.detach(), &failCount)));
+            threadpool.add(SkNEW_ARGS(SkTestRunnable, (test.detach(), &failCount)));
         }
     }
 
@@ -204,7 +204,7 @@ int test_main() {
     }
 
     // Block until threaded tests finish.
-    cpuTests.wait();
+    threadpool.wait();
 
     if (FLAGS_verbose) {
         SkDebugf("\nFinished %d tests, %d failures, %d skipped. (%d internal tests)",
diff --git a/tools/flags/SkCommonFlags.cpp b/tools/flags/SkCommonFlags.cpp
index 7776c36a59..4d08ba6340 100644
--- a/tools/flags/SkCommonFlags.cpp
+++ b/tools/flags/SkCommonFlags.cpp
@@ -6,6 +6,7 @@
  */
 
 #include "SkCommonFlags.h"
+#include "SkThreadPool.h"
 
 DEFINE_string(config, "565 8888 pdf gpu nonrendering angle",
               "Options: 565 8888 pdf gpu nonrendering msaa4 msaa16 nvprmsaa4 nvprmsaa16 "
@@ -41,10 +42,12 @@ DEFINE_bool(resetGpuContext, true, "Reset the GrContext before running each test
 DEFINE_bool(abandonGpuContext, false, "Abandon the GrContext after running each test. "
                                       "Implies --resetGpuContext.");
 
+DEFINE_bool2(single, z, false, "run tests on a single thread internally.");
+
 DEFINE_string(skps, "skps", "Directory to read skps from.");
 
-DEFINE_int32(threads, 0, "Run threadsafe tests on a threadpool with this many threads, "
-                         "defaulting to one thread per core.");
+DEFINE_int32(threads, SkThreadPool::kThreadPerCore,
+             "run threadsafe tests on a threadpool with this many threads.");
 
 DEFINE_bool2(verbose, v, false, "enable verbose output from the test driver.");
 
diff --git a/tools/flags/SkCommonFlags.h b/tools/flags/SkCommonFlags.h
index ecd4148448..b45ef0738a 100644
--- a/tools/flags/SkCommonFlags.h
+++ b/tools/flags/SkCommonFlags.h
@@ -20,6 +20,7 @@ DECLARE_string(match);
 DECLARE_bool(quiet);
 DECLARE_bool(resetGpuContext);
 DECLARE_bool(abandonGpuContext);
+DECLARE_bool(single);
 DECLARE_string(skps);
 DECLARE_int32(threads);
 DECLARE_string(resourcePath);
diff --git a/tools/iOSShell.cpp b/tools/iOSShell.cpp
index 8656e645aa..49be34d65b 100644
--- a/tools/iOSShell.cpp
+++ b/tools/iOSShell.cpp
@@ -12,6 +12,7 @@
 #include "SkCanvas.h"
 #include "SkCommonFlags.h"
 #include "SkGraphics.h"
+#include "SkThreadPool.h"
 #include "SkWindow.h"
 #include "sk_tool_utils.h"
 
diff --git a/tools/skpdiff/SkDiffContext.cpp b/tools/skpdiff/SkDiffContext.cpp
index 78d8400968..42d20de19d 100644
--- a/tools/skpdiff/SkDiffContext.cpp
+++ b/tools/skpdiff/SkDiffContext.cpp
@@ -12,7 +12,7 @@
 #include "SkSize.h"
 #include "SkStream.h"
 #include "SkTDict.h"
-#include "SkTaskGroup.h"
+#include "SkThreadPool.h"
 
 // from the tools directory for replace_char(...)
 #include "picture_utils.h"
@@ -24,6 +24,7 @@
 SkDiffContext::SkDiffContext() {
     fDiffers = NULL;
     fDifferCount = 0;
+    fThreadCount = SkThreadPool::kThreadPerCore;
 }
 
 SkDiffContext::~SkDiffContext() {
@@ -86,7 +87,7 @@ static SkString get_common_prefix(const SkString& a, const SkString& b) {
 }
 
 static SkString get_combined_name(const SkString& a, const SkString& b) {
-    // Note (stephana): We must keep this function in sync with
+    // Note (stephana): We must keep this function in sync with 
     // getImageDiffRelativeUrl() in static/loader.js (under rebaseline_server).
     SkString result = a;
     result.append("-vs-");
@@ -237,7 +238,7 @@ void SkDiffContext::diffDirectories(const char baselinePath[], const char testPa
         return;
     }
 
-    SkTaskGroup tg;
+    SkThreadPool threadPool(fThreadCount);
     SkTArray<SkThreadedDiff> runnableDiffs;
     runnableDiffs.reset(baselineEntries.count());
 
@@ -252,11 +253,13 @@ void SkDiffContext::diffDirectories(const char baselinePath[], const char testPa
         if (sk_exists(testFile.c_str()) && !sk_isdir(testFile.c_str())) {
             // Queue up the comparison with the differ
             runnableDiffs[x].setup(this, baselineFile, testFile);
-            tg.add(&runnableDiffs[x]);
+            threadPool.add(&runnableDiffs[x]);
         } else {
             SkDebugf("Baseline file \"%s\" has no corresponding test file\n", baselineFile.c_str());
         }
     }
+
+    threadPool.wait();
 }
 
 
@@ -281,14 +284,16 @@ void SkDiffContext::diffPatterns(const char baselinePattern[], const char testPa
         return;
     }
 
-    SkTaskGroup tg;
+    SkThreadPool threadPool(fThreadCount);
     SkTArray<SkThreadedDiff> runnableDiffs;
     runnableDiffs.reset(baselineEntries.count());
 
     for (int x = 0; x < baselineEntries.count(); x++) {
         runnableDiffs[x].setup(this, baselineEntries[x], testEntries[x]);
-        tg.add(&runnableDiffs[x]);
+        threadPool.add(&runnableDiffs[x]);
     }
+
+    threadPool.wait();
 }
 
 void SkDiffContext::outputRecords(SkWStream& stream, bool useJSONP) {
diff --git a/tools/valgrind.supp b/tools/valgrind.supp
index f0331ed4cb..d96ada04e7 100644
--- a/tools/valgrind.supp
+++ b/tools/valgrind.supp
@@ -213,12 +213,3 @@
     Memcheck:Addr2
     fun:_ZN22GrAAConvexPathRenderer10onDrawPathERK6SkPathRK11SkStrokeRecP12GrDrawTargetb
 }
-
-{
-    sk_task_group_thread_pool_intentionally_leaks_in_Release_mode
-    Memcheck:Leak
-    ...
-    fun:_ZN8SkThreadC1EPFvPvES0_
-    ...
-    fun:_ZN11SkTaskGroup3addEP10SkRunnable
-}
author	mtklein <mtklein@google.com>	2014-09-03 14:17:48 -0700
committer	Commit bot <commit-bot@chromium.org>	2014-09-03 14:17:48 -0700
commit	2460bbdfbb1d55ef307c3189c661e65de1a7affb (patch)
tree	62342a335023875d1482447bea9e29e8a0ba22fb
parent	9c7207b5dc71dc5a96a2eb107d401133333d5b6f (diff)