Add TiledDrawScheduler so we can concurrently draw and enque

(instead of finishing enque before draw). The highlight is that we can now achieve 9x speedup compared to 5x in all our previous approaches (including multi-picture draw). The schedulers here are experimental. I'd like to move on to try initializing once for each draw before further polishing and optimizing the schedule mechanism. Bug: skia: Change-Id: Idc3d030d475af9645c24c5372ff62b9a402206cc Reviewed-on: https://skia-review.googlesource.com/17826 Reviewed-by: Mike Reed <reed@google.com> Reviewed-by: Herb Derby <herb@google.com> Commit-Queue: Yuqian Li <liyuqian@google.com>
author: Yuqian Li <liyuqian@google.com> 2017-06-05 13:36:32 -0400
committer: Skia Commit-Bot <skia-commit-bot@chromium.org> 2017-06-05 19:29:57 +0000
commit: 70898afe073c49d8151b25cc5bf234f61c76ffae (patch)
tree: b5f9e01a22cf7135a757efcb51cc321871032819 /src
parent: 9653d3aa84505c30aa5440b5629cdb25525666c3 (diff)
2 files changed, 253 insertions, 21 deletions
diff --git a/src/core/SkThreadedBMPDevice.cpp b/src/core/SkThreadedBMPDevice.cpp
index 1cf7fe449a..0e45b9fbf6 100644
--- a/src/core/SkThreadedBMPDevice.cpp
+++ b/src/core/SkThreadedBMPDevice.cpp
@@ -11,28 +11,222 @@
 #include "SkTaskGroup.h"
 #include "SkVertices.h"
 
-SkThreadedBMPDevice::SkThreadedBMPDevice(const SkBitmap& bitmap, int threads)
+#include <mutex>
+#include <vector>
+
+constexpr int MAX_CACHE_LINE = 64;
+
+// Some basic logics and data structures that are shared across the current experimental schedulers.
+class TiledDrawSchedulerBase : public TiledDrawScheduler {
+public:
+    TiledDrawSchedulerBase(int tiles, WorkFunc work)
+            : fTileCnt(tiles), fIsFinishing(false), fDrawCnt(0), fWork(work) {}
+
+    void signal() override {
+        fDrawCnt++;
+    }
+    void finish() override {
+        fIsFinishing.store(true, std::memory_order_relaxed);
+    }
+
+protected:
+    const int                   fTileCnt;
+    std::atomic<bool>           fIsFinishing;
+    std::atomic<int>            fDrawCnt;
+    WorkFunc                    fWork;
+};
+
+class TiledDrawSchedulerBySpinning : public TiledDrawSchedulerBase {
+public:
+    TiledDrawSchedulerBySpinning(int tiles, WorkFunc work)
+            : TiledDrawSchedulerBase(tiles, work), fScheduleData(tiles) {}
+
+    void signal() final { this->TiledDrawSchedulerBase::signal(); }
+    void finish() final { this->TiledDrawSchedulerBase::finish(); }
+
+    bool next(int& tileIndex) final {
+        int& drawIndex = fScheduleData[tileIndex].fDrawIndex;
+        SkASSERT(drawIndex <= fDrawCnt);
+        while (true) {
+            bool isFinishing = fIsFinishing.load(std::memory_order_relaxed);
+            if (isFinishing && drawIndex >= fDrawCnt) {
+                return false;
+            } else if (drawIndex < fDrawCnt) {
+                fWork(tileIndex, drawIndex++);
+                return true;
+            }
+        }
+    }
+
+private:
+    // alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines
+    struct alignas(MAX_CACHE_LINE) TileScheduleData {
+        TileScheduleData() : fDrawIndex(0) {}
+
+        int fDrawIndex; // next draw index for this tile
+    };
+
+    std::vector<TileScheduleData>  fScheduleData;
+};
+
+class TiledDrawSchedulerFlexible : public TiledDrawSchedulerBase {
+public:
+    TiledDrawSchedulerFlexible(int tiles, WorkFunc work)
+            : TiledDrawSchedulerBase(tiles, work), fScheduleData(tiles) {}
+
+    void signal() final { this->TiledDrawSchedulerBase::signal(); }
+    void finish() final { this->TiledDrawSchedulerBase::finish(); }
+
+    bool next(int& tileIndex) final {
+        int failCnt = 0;
+        while (true) {
+            TileScheduleData& scheduleData = fScheduleData[tileIndex];
+            bool locked = scheduleData.fMutex.try_lock();
+            bool processed = false;
+
+            if (locked) {
+                if (scheduleData.fDrawIndex < fDrawCnt) {
+                    fWork(tileIndex, scheduleData.fDrawIndex++);
+                    processed = true;
+                } else {
+                    failCnt += fIsFinishing.load(std::memory_order_relaxed);
+                }
+                scheduleData.fMutex.unlock();
+            }
+
+            if (processed) {
+                return true;
+            } else {
+                if (failCnt >= fTileCnt) {
+                    return false;
+                }
+                tileIndex = (tileIndex + 1) % fTileCnt;
+            }
+        }
+    }
+
+private:
+    // alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines
+    struct alignas(MAX_CACHE_LINE) TileScheduleData {
+        TileScheduleData() : fDrawIndex(0) {}
+
+        int         fDrawIndex; // next draw index for this tile
+        std::mutex  fMutex;     // the mutex for the thread to acquire
+    };
+
+    std::vector<TileScheduleData>  fScheduleData;
+};
+
+class TiledDrawSchedulerBySemaphores : public TiledDrawSchedulerBase {
+public:
+    TiledDrawSchedulerBySemaphores(int tiles, WorkFunc work)
+            : TiledDrawSchedulerBase(tiles, work), fScheduleData(tiles) {}
+
+
+    void signal() final {
+        this->TiledDrawSchedulerBase::signal();
+        signalRoot();
+    }
+
+    void finish() final {
+        this->TiledDrawSchedulerBase::finish();
+        signalRoot();
+    }
+
+    bool next(int& tileIndex) final {
+        SkASSERT(tileIndex >= 0 && tileIndex < fTileCnt);
+        TileScheduleData& scheduleData = fScheduleData[tileIndex];
+        while (true) {
+            scheduleData.fSemaphore.wait();
+            int leftChild = (tileIndex + 1) * 2 - 1;
+            int rightChild = leftChild + 1;
+            if (leftChild < fTileCnt) {
+                fScheduleData[leftChild].fSemaphore.signal();
+            }
+            if (rightChild < fTileCnt) {
+                fScheduleData[rightChild].fSemaphore.signal();
+            }
+
+            bool isFinishing = fIsFinishing.load(std::memory_order_relaxed);
+            if (isFinishing && scheduleData.fDrawIndex >= fDrawCnt) {
+                return false;
+            } else {
+                SkASSERT(scheduleData.fDrawIndex < fDrawCnt);
+                fWork(tileIndex, scheduleData.fDrawIndex++);
+                return true;
+            }
+        }
+    }
+
+private:
+    // alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines
+    struct alignas(MAX_CACHE_LINE) TileScheduleData {
+        TileScheduleData() : fDrawIndex(0) {}
+
+        int         fDrawIndex;
+        SkSemaphore fSemaphore;
+    };
+
+    void signalRoot() {
+        SkASSERT(fTileCnt > 0);
+        fScheduleData[0].fSemaphore.signal();
+    }
+
+    std::vector<TileScheduleData> fScheduleData;
+};
+
+void SkThreadedBMPDevice::startThreads() {
+    SkASSERT(fThreadFutures.count() == 0);
+    SkASSERT(fQueueSize == 0);
+
+    TiledDrawScheduler::WorkFunc work = [this](int tileIndex, int drawIndex){
+        auto& element = fQueue[drawIndex];
+        if (SkIRect::Intersects(fTileBounds[tileIndex], element.fDrawBounds)) {
+            element.fDrawFn(fTileBounds[tileIndex]);
+        }
+    };
+
+    // using Scheduler = TiledDrawSchedulerBySemaphores;
+    // using Scheduler = TiledDrawSchedulerBySpinning;
+    using Scheduler = TiledDrawSchedulerFlexible;
+    fScheduler.reset(new Scheduler(fTileCnt, work));
+    for(int i = 0; i < fThreadCnt; ++i) {
+        fThreadFutures.push_back(std::async(std::launch::async, [this, i]() {
+            int tileIndex = i;
+            while (fScheduler->next(tileIndex)) {}
+        }));
+    }
+}
+
+void SkThreadedBMPDevice::finishThreads() {
+    fScheduler->finish();
+    for(auto& future : fThreadFutures) {
+        future.wait();
+    }
+    fThreadFutures.reset();
+    fQueueSize = 0;
+    fScheduler.reset(nullptr);
+}
+
+SkThreadedBMPDevice::SkThreadedBMPDevice(const SkBitmap& bitmap, int tiles, int threads)
         : INHERITED(bitmap)
-        , fThreadCnt(threads)
+        , fTileCnt(tiles)
+        , fThreadCnt(threads <= 0 ? tiles : threads)
 {
     // Tiling using stripes for now; we'll explore better tiling in the future.
-    int h = (bitmap.height() + fThreadCnt - 1) / SkTMax(fThreadCnt, 1);
+    int h = (bitmap.height() + fTileCnt - 1) / SkTMax(fTileCnt, 1);
     int w = bitmap.width();
     int top = 0;
-    for(int tid = 0; tid < fThreadCnt; ++tid, top += h) {
-        fThreadBounds.push_back(SkIRect::MakeLTRB(0, top, w, top + h));
+    for(int tid = 0; tid < fTileCnt; ++tid, top += h) {
+        fTileBounds.push_back(SkIRect::MakeLTRB(0, top, w, top + h));
     }
+    fQueueSize = 0;
+    startThreads();
 }
 
 void SkThreadedBMPDevice::flush() {
-    SkTaskGroup().batch(fThreadCnt, [this](int i) {
-        for(auto& element : fQueue) {
-            if (SkIRect::Intersects(fThreadBounds[i], element.fDrawBounds)) {
-                element.fDrawFn(fThreadBounds[i]);
-            }
-        }
-    });
-    fQueue.reset();
+    finishThreads();
+    startThreads();
 }
 
 // Having this captured in lambda seems to be faster than saving this in DrawElement
@@ -75,14 +269,16 @@ SkIRect SkThreadedBMPDevice::transformDrawBounds(const SkRect& drawBounds) const
 #define THREADED_DRAW(drawBounds, actualDrawCall)                                                  \
     do {                                                                                           \
         DrawState ds(this);                                                                        \
-        fQueue.push_back({                                                                         \
+        SkASSERT(fQueueSize < MAX_QUEUE_SIZE);                                                     \
+        fQueue[fQueueSize++] = {                                                                   \
             this->transformDrawBounds(drawBounds),                                                 \
-            [=](const SkIRect& threadBounds) {                                                     \
-                SkRasterClip threadRC;                                                             \
-                SkDraw draw = ds.getThreadDraw(threadRC, threadBounds);                            \
+            [=](const SkIRect& tileBounds) {                                                       \
+                SkRasterClip tileRC;                                                               \
+                SkDraw draw = ds.getThreadDraw(tileRC, tileBounds);                                \
                 draw.actualDrawCall;                                                               \
             },                                                                                     \
-        });                                                                                        \
+        };                                                                                         \
+        fScheduler->signal();                                                                      \
     } while (false)
 
 static inline SkRect get_fast_bounds(const SkRect& r, const SkPaint& p) {
diff --git a/src/core/SkThreadedBMPDevice.h b/src/core/SkThreadedBMPDevice.h
index ea69f7a858..e973df2303 100644
--- a/src/core/SkThreadedBMPDevice.h
+++ b/src/core/SkThreadedBMPDevice.h
@@ -11,10 +11,37 @@
 #include "SkDraw.h"
 #include "SkBitmapDevice.h"
 
+#include <future>
+
+class TiledDrawScheduler {
+public:
+    using WorkFunc = std::function<void(int, int)>;
+
+    virtual ~TiledDrawScheduler() {}
+
+    virtual void signal() = 0; // signal that one more draw is available for all tiles
+
+    // Tell scheduler that no more draw calls will be added (no signal will be called).
+    virtual void finish() = 0;
+
+    // Handle the next draw available. This method will block until
+    //   (1) the next draw is finished, or
+    //   (2) the finish is called
+    // The method will return true for case (1) and false for case (2).
+    // When there's no draw available and we haven't called finish, we will just wait.
+    // In many cases, the parameter tileIndex specifies the tile that the next draw should happen.
+    // However, for some schedulers, that tileIndex may only be a hint and the scheduler is free
+    // to find another tile to draw. In that case, tileIndex will be changed to the actual tileIndex
+    // where the draw happens.
+    virtual bool next(int& tileIndex) = 0;
+};
+
 ///////////////////////////////////////////////////////////////////////////////
 class SkThreadedBMPDevice : public SkBitmapDevice {
 public:
-    SkThreadedBMPDevice(const SkBitmap& bitmap, int threads);
+    // When threads = 0, we make fThreadCnt = fTileCnt
+    SkThreadedBMPDevice(const SkBitmap& bitmap, int tiles, int threads = 0);
+    ~SkThreadedBMPDevice() { finishThreads(); }
 
 protected:
     void drawPaint(const SkPaint& paint) override;
@@ -47,9 +74,18 @@ private:
 
     SkIRect transformDrawBounds(const SkRect& drawBounds) const;
 
+    void startThreads();
+    void finishThreads();
+
+    static constexpr int MAX_QUEUE_SIZE = 100000;
+
+    const int fTileCnt;
     const int fThreadCnt;
-    SkTArray<SkIRect> fThreadBounds;
-    SkTArray<DrawElement> fQueue;
+    std::unique_ptr<TiledDrawScheduler> fScheduler;
+    SkTArray<SkIRect> fTileBounds;
+    SkTArray<std::future<void>> fThreadFutures;
+    DrawElement fQueue[MAX_QUEUE_SIZE];
+    int fQueueSize;
 
     typedef SkBitmapDevice INHERITED;
 };
author	Yuqian Li <liyuqian@google.com>	2017-06-05 13:36:32 -0400
committer	Skia Commit-Bot <skia-commit-bot@chromium.org>	2017-06-05 19:29:57 +0000
commit	70898afe073c49d8151b25cc5bf234f61c76ffae (patch)
tree	b5f9e01a22cf7135a757efcb51cc321871032819 /src
parent	9653d3aa84505c30aa5440b5629cdb25525666c3 (diff)