diff options
Diffstat (limited to 'src/core/SkThreadedBMPDevice.cpp')
-rw-r--r-- | src/core/SkThreadedBMPDevice.cpp | 232 |
1 file changed, 214 insertions, 18 deletions
diff --git a/src/core/SkThreadedBMPDevice.cpp b/src/core/SkThreadedBMPDevice.cpp index 1cf7fe449a..0e45b9fbf6 100644 --- a/src/core/SkThreadedBMPDevice.cpp +++ b/src/core/SkThreadedBMPDevice.cpp @@ -11,28 +11,222 @@ #include "SkTaskGroup.h" #include "SkVertices.h" -SkThreadedBMPDevice::SkThreadedBMPDevice(const SkBitmap& bitmap, int threads) +#include <mutex> +#include <vector> + +constexpr int MAX_CACHE_LINE = 64; + +// Some basic logics and data structures that are shared across the current experimental schedulers. +class TiledDrawSchedulerBase : public TiledDrawScheduler { +public: + TiledDrawSchedulerBase(int tiles, WorkFunc work) + : fTileCnt(tiles), fIsFinishing(false), fDrawCnt(0), fWork(work) {} + + void signal() override { + fDrawCnt++; + } + void finish() override { + fIsFinishing.store(true, std::memory_order_relaxed); + } + +protected: + const int fTileCnt; + std::atomic<bool> fIsFinishing; + std::atomic<int> fDrawCnt; + WorkFunc fWork; +}; + +class TiledDrawSchedulerBySpinning : public TiledDrawSchedulerBase { +public: + TiledDrawSchedulerBySpinning(int tiles, WorkFunc work) + : TiledDrawSchedulerBase(tiles, work), fScheduleData(tiles) {} + + void signal() final { this->TiledDrawSchedulerBase::signal(); } + void finish() final { this->TiledDrawSchedulerBase::finish(); } + + bool next(int& tileIndex) final { + int& drawIndex = fScheduleData[tileIndex].fDrawIndex; + SkASSERT(drawIndex <= fDrawCnt); + while (true) { + bool isFinishing = fIsFinishing.load(std::memory_order_relaxed); + if (isFinishing && drawIndex >= fDrawCnt) { + return false; + } else if (drawIndex < fDrawCnt) { + fWork(tileIndex, drawIndex++); + return true; + } + } + } + +private: + // alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines + struct alignas(MAX_CACHE_LINE) TileScheduleData { + TileScheduleData() : fDrawIndex(0) {} + + int fDrawIndex; // next draw index for this tile + }; + + std::vector<TileScheduleData> fScheduleData; +}; + +class TiledDrawSchedulerFlexible : 
public TiledDrawSchedulerBase { +public: + TiledDrawSchedulerFlexible(int tiles, WorkFunc work) + : TiledDrawSchedulerBase(tiles, work), fScheduleData(tiles) {} + + void signal() final { this->TiledDrawSchedulerBase::signal(); } + void finish() final { this->TiledDrawSchedulerBase::finish(); } + + bool next(int& tileIndex) final { + int failCnt = 0; + while (true) { + TileScheduleData& scheduleData = fScheduleData[tileIndex]; + bool locked = scheduleData.fMutex.try_lock(); + bool processed = false; + + if (locked) { + if (scheduleData.fDrawIndex < fDrawCnt) { + fWork(tileIndex, scheduleData.fDrawIndex++); + processed = true; + } else { + failCnt += fIsFinishing.load(std::memory_order_relaxed); + } + scheduleData.fMutex.unlock(); + } + + if (processed) { + return true; + } else { + if (failCnt >= fTileCnt) { + return false; + } + tileIndex = (tileIndex + 1) % fTileCnt; + } + } + } + +private: + // alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines + struct alignas(MAX_CACHE_LINE) TileScheduleData { + TileScheduleData() : fDrawIndex(0) {} + + int fDrawIndex; // next draw index for this tile + std::mutex fMutex; // the mutex for the thread to acquire + }; + + std::vector<TileScheduleData> fScheduleData; +}; + +class TiledDrawSchedulerBySemaphores : public TiledDrawSchedulerBase { +public: + TiledDrawSchedulerBySemaphores(int tiles, WorkFunc work) + : TiledDrawSchedulerBase(tiles, work), fScheduleData(tiles) {} + + + void signal() final { + this->TiledDrawSchedulerBase::signal(); + signalRoot(); + } + + void finish() final { + this->TiledDrawSchedulerBase::finish(); + signalRoot(); + } + + bool next(int& tileIndex) final { + SkASSERT(tileIndex >= 0 && tileIndex < fTileCnt); + TileScheduleData& scheduleData = fScheduleData[tileIndex]; + while (true) { + scheduleData.fSemaphore.wait(); + int leftChild = (tileIndex + 1) * 2 - 1; + int rightChild = leftChild + 1; + if (leftChild < fTileCnt) { + fScheduleData[leftChild].fSemaphore.signal(); + } + if (rightChild < 
fTileCnt) { + fScheduleData[rightChild].fSemaphore.signal(); + } + + bool isFinishing = fIsFinishing.load(std::memory_order_relaxed); + if (isFinishing && scheduleData.fDrawIndex >= fDrawCnt) { + return false; + } else { + SkASSERT(scheduleData.fDrawIndex < fDrawCnt); + fWork(tileIndex, scheduleData.fDrawIndex++); + return true; + } + } + } + +private: + // alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines + struct alignas(MAX_CACHE_LINE) TileScheduleData { + TileScheduleData() : fDrawIndex(0) {} + + int fDrawIndex; + SkSemaphore fSemaphore; + }; + + void signalRoot() { + SkASSERT(fTileCnt > 0); + fScheduleData[0].fSemaphore.signal(); + } + + std::vector<TileScheduleData> fScheduleData; +}; + +void SkThreadedBMPDevice::startThreads() { + SkASSERT(fThreadFutures.count() == 0); + SkASSERT(fQueueSize == 0); + + TiledDrawScheduler::WorkFunc work = [this](int tileIndex, int drawIndex){ + auto& element = fQueue[drawIndex]; + if (SkIRect::Intersects(fTileBounds[tileIndex], element.fDrawBounds)) { + element.fDrawFn(fTileBounds[tileIndex]); + } + }; + + // using Scheduler = TiledDrawSchedulerBySemaphores; + // using Scheduler = TiledDrawSchedulerBySpinning; + using Scheduler = TiledDrawSchedulerFlexible; + fScheduler.reset(new Scheduler(fTileCnt, work)); + for(int i = 0; i < fThreadCnt; ++i) { + fThreadFutures.push_back(std::async(std::launch::async, [this, i]() { + int tileIndex = i; + while (fScheduler->next(tileIndex)) {} + })); + } +} + +void SkThreadedBMPDevice::finishThreads() { + fScheduler->finish(); + for(auto& future : fThreadFutures) { + future.wait(); + } + fThreadFutures.reset(); + fQueueSize = 0; + fScheduler.reset(nullptr); +} + +SkThreadedBMPDevice::SkThreadedBMPDevice(const SkBitmap& bitmap, int tiles, int threads) : INHERITED(bitmap) - , fThreadCnt(threads) + , fTileCnt(tiles) + , fThreadCnt(threads <= 0 ? tiles : threads) { // Tiling using stripes for now; we'll explore better tiling in the future. 
- int h = (bitmap.height() + fThreadCnt - 1) / SkTMax(fThreadCnt, 1); + int h = (bitmap.height() + fTileCnt - 1) / SkTMax(fTileCnt, 1); int w = bitmap.width(); int top = 0; - for(int tid = 0; tid < fThreadCnt; ++tid, top += h) { - fThreadBounds.push_back(SkIRect::MakeLTRB(0, top, w, top + h)); + for(int tid = 0; tid < fTileCnt; ++tid, top += h) { + fTileBounds.push_back(SkIRect::MakeLTRB(0, top, w, top + h)); } + fQueueSize = 0; + startThreads(); } void SkThreadedBMPDevice::flush() { - SkTaskGroup().batch(fThreadCnt, [this](int i) { - for(auto& element : fQueue) { - if (SkIRect::Intersects(fThreadBounds[i], element.fDrawBounds)) { - element.fDrawFn(fThreadBounds[i]); - } - } - }); - fQueue.reset(); + finishThreads(); + startThreads(); } // Having this captured in lambda seems to be faster than saving this in DrawElement @@ -75,14 +269,16 @@ SkIRect SkThreadedBMPDevice::transformDrawBounds(const SkRect& drawBounds) const #define THREADED_DRAW(drawBounds, actualDrawCall) \ do { \ DrawState ds(this); \ - fQueue.push_back({ \ + SkASSERT(fQueueSize < MAX_QUEUE_SIZE); \ + fQueue[fQueueSize++] = { \ this->transformDrawBounds(drawBounds), \ - [=](const SkIRect& threadBounds) { \ - SkRasterClip threadRC; \ - SkDraw draw = ds.getThreadDraw(threadRC, threadBounds); \ + [=](const SkIRect& tileBounds) { \ + SkRasterClip tileRC; \ + SkDraw draw = ds.getThreadDraw(tileRC, tileBounds); \ draw.actualDrawCall; \ }, \ - }); \ + }; \ + fScheduler->signal(); \ } while (false) static inline SkRect get_fast_bounds(const SkRect& r, const SkPaint& p) { |