Add TiledDrawScheduler so we can concurrently draw and enque

(instead of finishing enque before draw). The highlight is that we can now achieve 9x speedup compared to 5x in all our previous approaches (including multi-picture draw). The schedulers here are experimental. I'd like to move on to try initializing once for each draw before further polishing and optimizing the schedule mechanism. Bug: skia: Change-Id: Idc3d030d475af9645c24c5372ff62b9a402206cc Reviewed-on: https://skia-review.googlesource.com/17826 Reviewed-by: Mike Reed <reed@google.com> Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Yuqian Li <liyuqian@google.com>
author: Yuqian Li <liyuqian@google.com> 2017-06-05 13:36:32 -0400
committer: Skia Commit-Bot <skia-commit-bot@chromium.org> 2017-06-05 19:29:57 +0000
commit: 70898afe073c49d8151b25cc5bf234f61c76ffae (patch)
tree: b5f9e01a22cf7135a757efcb51cc321871032819 /src/core/SkThreadedBMPDevice.cpp
parent: 9653d3aa84505c30aa5440b5629cdb25525666c3 (diff)
1 files changed, 214 insertions, 18 deletions
diff --git a/src/core/SkThreadedBMPDevice.cpp b/src/core/SkThreadedBMPDevice.cpp
index 1cf7fe449a..0e45b9fbf6 100644
--- a/src/core/SkThreadedBMPDevice.cpp
+++ b/src/core/SkThreadedBMPDevice.cpp
@@ -11,28 +11,222 @@
 #include "SkTaskGroup.h"
 #include "SkVertices.h"
 
-SkThreadedBMPDevice::SkThreadedBMPDevice(const SkBitmap& bitmap, int threads)
+#include <mutex>
+#include <vector>
+
+constexpr int MAX_CACHE_LINE = 64;
+
+// Some basic logics and data structures that are shared across the current experimental schedulers.
+class TiledDrawSchedulerBase : public TiledDrawScheduler {
+public:
+    TiledDrawSchedulerBase(int tiles, WorkFunc work)
+            : fTileCnt(tiles), fIsFinishing(false), fDrawCnt(0), fWork(work) {}
+
+    void signal() override {
+        fDrawCnt++;
+    }
+    void finish() override {
+        fIsFinishing.store(true, std::memory_order_relaxed);
+    }
+
+protected:
+    const int                   fTileCnt;
+    std::atomic<bool>           fIsFinishing;
+    std::atomic<int>            fDrawCnt;
+    WorkFunc                    fWork;
+};
+
+class TiledDrawSchedulerBySpinning : public TiledDrawSchedulerBase {
+public:
+    TiledDrawSchedulerBySpinning(int tiles, WorkFunc work)
+            : TiledDrawSchedulerBase(tiles, work), fScheduleData(tiles) {}
+
+    void signal() final { this->TiledDrawSchedulerBase::signal(); }
+    void finish() final { this->TiledDrawSchedulerBase::finish(); }
+
+    bool next(int& tileIndex) final {
+        int& drawIndex = fScheduleData[tileIndex].fDrawIndex;
+        SkASSERT(drawIndex <= fDrawCnt);
+        while (true) {
+            bool isFinishing = fIsFinishing.load(std::memory_order_relaxed);
+            if (isFinishing && drawIndex >= fDrawCnt) {
+                return false;
+            } else if (drawIndex < fDrawCnt) {
+                fWork(tileIndex, drawIndex++);
+                return true;
+            }
+        }
+    }
+
+private:
+    // alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines
+    struct alignas(MAX_CACHE_LINE) TileScheduleData {
+        TileScheduleData() : fDrawIndex(0) {}
+
+        int fDrawIndex; // next draw index for this tile
+    };
+
+    std::vector<TileScheduleData>  fScheduleData;
+};
+
+class TiledDrawSchedulerFlexible : public TiledDrawSchedulerBase {
+public:
+    TiledDrawSchedulerFlexible(int tiles, WorkFunc work)
+            : TiledDrawSchedulerBase(tiles, work), fScheduleData(tiles) {}
+
+    void signal() final { this->TiledDrawSchedulerBase::signal(); }
+    void finish() final { this->TiledDrawSchedulerBase::finish(); }
+
+    bool next(int& tileIndex) final {
+        int failCnt = 0;
+        while (true) {
+            TileScheduleData& scheduleData = fScheduleData[tileIndex];
+            bool locked = scheduleData.fMutex.try_lock();
+            bool processed = false;
+
+            if (locked) {
+                if (scheduleData.fDrawIndex < fDrawCnt) {
+                    fWork(tileIndex, scheduleData.fDrawIndex++);
+                    processed = true;
+                } else {
+                    failCnt += fIsFinishing.load(std::memory_order_relaxed);
+                }
+                scheduleData.fMutex.unlock();
+            }
+
+            if (processed) {
+                return true;
+            } else {
+                if (failCnt >= fTileCnt) {
+                    return false;
+                }
+                tileIndex = (tileIndex + 1) % fTileCnt;
+            }
+        }
+    }
+
+private:
+    // alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines
+    struct alignas(MAX_CACHE_LINE) TileScheduleData {
+        TileScheduleData() : fDrawIndex(0) {}
+
+        int         fDrawIndex; // next draw index for this tile
+        std::mutex  fMutex;     // the mutex for the thread to acquire
+    };
+
+    std::vector<TileScheduleData>  fScheduleData;
+};
+
+class TiledDrawSchedulerBySemaphores : public TiledDrawSchedulerBase {
+public:
+    TiledDrawSchedulerBySemaphores(int tiles, WorkFunc work)
+            : TiledDrawSchedulerBase(tiles, work), fScheduleData(tiles) {}
+
+
+    void signal() final {
+        this->TiledDrawSchedulerBase::signal();
+        signalRoot();
+    }
+
+    void finish() final {
+        this->TiledDrawSchedulerBase::finish();
+        signalRoot();
+    }
+
+    bool next(int& tileIndex) final {
+        SkASSERT(tileIndex >= 0 && tileIndex < fTileCnt);
+        TileScheduleData& scheduleData = fScheduleData[tileIndex];
+        while (true) {
+            scheduleData.fSemaphore.wait();
+            int leftChild = (tileIndex + 1) * 2 - 1;
+            int rightChild = leftChild + 1;
+            if (leftChild < fTileCnt) {
+                fScheduleData[leftChild].fSemaphore.signal();
+            }
+            if (rightChild < fTileCnt) {
+                fScheduleData[rightChild].fSemaphore.signal();
+            }
+
+            bool isFinishing = fIsFinishing.load(std::memory_order_relaxed);
+            if (isFinishing && scheduleData.fDrawIndex >= fDrawCnt) {
+                return false;
+            } else {
+                SkASSERT(scheduleData.fDrawIndex < fDrawCnt);
+                fWork(tileIndex, scheduleData.fDrawIndex++);
+                return true;
+            }
+        }
+    }
+
+private:
+    // alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines
+    struct alignas(MAX_CACHE_LINE) TileScheduleData {
+        TileScheduleData() : fDrawIndex(0) {}
+
+        int         fDrawIndex;
+        SkSemaphore fSemaphore;
+    };
+
+    void signalRoot() {
+        SkASSERT(fTileCnt > 0);
+        fScheduleData[0].fSemaphore.signal();
+    }
+
+    std::vector<TileScheduleData> fScheduleData;
+};
+
+void SkThreadedBMPDevice::startThreads() {
+    SkASSERT(fThreadFutures.count() == 0);
+    SkASSERT(fQueueSize == 0);
+
+    TiledDrawScheduler::WorkFunc work = [this](int tileIndex, int drawIndex){
+        auto& element = fQueue[drawIndex];
+        if (SkIRect::Intersects(fTileBounds[tileIndex], element.fDrawBounds)) {
+            element.fDrawFn(fTileBounds[tileIndex]);
+        }
+    };
+
+    // using Scheduler = TiledDrawSchedulerBySemaphores;
+    // using Scheduler = TiledDrawSchedulerBySpinning;
+    using Scheduler = TiledDrawSchedulerFlexible;
+    fScheduler.reset(new Scheduler(fTileCnt, work));
+    for(int i = 0; i < fThreadCnt; ++i) {
+        fThreadFutures.push_back(std::async(std::launch::async, [this, i]() {
+            int tileIndex = i;
+            while (fScheduler->next(tileIndex)) {}
+        }));
+    }
+}
+
+void SkThreadedBMPDevice::finishThreads() {
+    fScheduler->finish();
+    for(auto& future : fThreadFutures) {
+        future.wait();
+    }
+    fThreadFutures.reset();
+    fQueueSize = 0;
+    fScheduler.reset(nullptr);
+}
+
+SkThreadedBMPDevice::SkThreadedBMPDevice(const SkBitmap& bitmap, int tiles, int threads)
         : INHERITED(bitmap)
-        , fThreadCnt(threads)
+        , fTileCnt(tiles)
+        , fThreadCnt(threads <= 0 ? tiles : threads)
 {
     // Tiling using stripes for now; we'll explore better tiling in the future.
-    int h = (bitmap.height() + fThreadCnt - 1) / SkTMax(fThreadCnt, 1);
+    int h = (bitmap.height() + fTileCnt - 1) / SkTMax(fTileCnt, 1);
     int w = bitmap.width();
     int top = 0;
-    for(int tid = 0; tid < fThreadCnt; ++tid, top += h) {
-        fThreadBounds.push_back(SkIRect::MakeLTRB(0, top, w, top + h));
+    for(int tid = 0; tid < fTileCnt; ++tid, top += h) {
+        fTileBounds.push_back(SkIRect::MakeLTRB(0, top, w, top + h));
     }
+    fQueueSize = 0;
+    startThreads();
 }
 
 void SkThreadedBMPDevice::flush() {
-    SkTaskGroup().batch(fThreadCnt, [this](int i) {
-        for(auto& element : fQueue) {
-            if (SkIRect::Intersects(fThreadBounds[i], element.fDrawBounds)) {
-                element.fDrawFn(fThreadBounds[i]);
-            }
-        }
-    });
-    fQueue.reset();
+    finishThreads();
+    startThreads();
 }
 
 // Having this captured in lambda seems to be faster than saving this in DrawElement
@@ -75,14 +269,16 @@ SkIRect SkThreadedBMPDevice::transformDrawBounds(const SkRect& drawBounds) const
 #define THREADED_DRAW(drawBounds, actualDrawCall)                                                  \
     do {                                                                                           \
         DrawState ds(this);                                                                        \
-        fQueue.push_back({                                                                         \
+        SkASSERT(fQueueSize < MAX_QUEUE_SIZE);                                                     \
+        fQueue[fQueueSize++] = {                                                                   \
             this->transformDrawBounds(drawBounds),                                                 \
-            [=](const SkIRect& threadBounds) {                                                     \
-                SkRasterClip threadRC;                                                             \
-                SkDraw draw = ds.getThreadDraw(threadRC, threadBounds);                            \
+            [=](const SkIRect& tileBounds) {                                                       \
+                SkRasterClip tileRC;                                                               \
+                SkDraw draw = ds.getThreadDraw(tileRC, tileBounds);                                \
                 draw.actualDrawCall;                                                               \
             },                                                                                     \
-        });                                                                                        \
+        };                                                                                         \
+        fScheduler->signal();                                                                      \
     } while (false)
 
 static inline SkRect get_fast_bounds(const SkRect& r, const SkPaint& p) {
author	Yuqian Li <liyuqian@google.com>	2017-06-05 13:36:32 -0400
committer	Skia Commit-Bot <skia-commit-bot@chromium.org>	2017-06-05 19:29:57 +0000
commit	70898afe073c49d8151b25cc5bf234f61c76ffae (patch)
tree	b5f9e01a22cf7135a757efcb51cc321871032819 /src/core/SkThreadedBMPDevice.cpp
parent	9653d3aa84505c30aa5440b5629cdb25525666c3 (diff)