diff options
author | 2017-06-05 13:36:32 -0400 | |
---|---|---|
committer | 2017-06-05 19:29:57 +0000 | |
commit | 70898afe073c49d8151b25cc5bf234f61c76ffae (patch) | |
tree | b5f9e01a22cf7135a757efcb51cc321871032819 /src/core/SkThreadedBMPDevice.cpp | |
parent | 9653d3aa84505c30aa5440b5629cdb25525666c3 (diff) |
Add TiledDrawScheduler so we can concurrently draw and enqueue
(instead of finishing enqueueing before drawing). The highlight is that we can now
achieve a 9x speedup, compared to 5x in all our previous approaches
(including multi-picture draw).
The schedulers here are experimental. I'd like to move on to try initializing
once for each draw before further polishing and optimizing the scheduling
mechanism.
Bug: skia:
Change-Id: Idc3d030d475af9645c24c5372ff62b9a402206cc
Reviewed-on: https://skia-review.googlesource.com/17826
Reviewed-by: Mike Reed <reed@google.com>
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Yuqian Li <liyuqian@google.com>
Diffstat (limited to 'src/core/SkThreadedBMPDevice.cpp')
-rw-r--r-- | src/core/SkThreadedBMPDevice.cpp | 232 |
1 files changed, 214 insertions, 18 deletions
diff --git a/src/core/SkThreadedBMPDevice.cpp b/src/core/SkThreadedBMPDevice.cpp index 1cf7fe449a..0e45b9fbf6 100644 --- a/src/core/SkThreadedBMPDevice.cpp +++ b/src/core/SkThreadedBMPDevice.cpp @@ -11,28 +11,222 @@ #include "SkTaskGroup.h" #include "SkVertices.h" -SkThreadedBMPDevice::SkThreadedBMPDevice(const SkBitmap& bitmap, int threads) +#include <mutex> +#include <vector> + +constexpr int MAX_CACHE_LINE = 64; + +// Some basic logics and data structures that are shared across the current experimental schedulers. +class TiledDrawSchedulerBase : public TiledDrawScheduler { +public: + TiledDrawSchedulerBase(int tiles, WorkFunc work) + : fTileCnt(tiles), fIsFinishing(false), fDrawCnt(0), fWork(work) {} + + void signal() override { + fDrawCnt++; + } + void finish() override { + fIsFinishing.store(true, std::memory_order_relaxed); + } + +protected: + const int fTileCnt; + std::atomic<bool> fIsFinishing; + std::atomic<int> fDrawCnt; + WorkFunc fWork; +}; + +class TiledDrawSchedulerBySpinning : public TiledDrawSchedulerBase { +public: + TiledDrawSchedulerBySpinning(int tiles, WorkFunc work) + : TiledDrawSchedulerBase(tiles, work), fScheduleData(tiles) {} + + void signal() final { this->TiledDrawSchedulerBase::signal(); } + void finish() final { this->TiledDrawSchedulerBase::finish(); } + + bool next(int& tileIndex) final { + int& drawIndex = fScheduleData[tileIndex].fDrawIndex; + SkASSERT(drawIndex <= fDrawCnt); + while (true) { + bool isFinishing = fIsFinishing.load(std::memory_order_relaxed); + if (isFinishing && drawIndex >= fDrawCnt) { + return false; + } else if (drawIndex < fDrawCnt) { + fWork(tileIndex, drawIndex++); + return true; + } + } + } + +private: + // alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines + struct alignas(MAX_CACHE_LINE) TileScheduleData { + TileScheduleData() : fDrawIndex(0) {} + + int fDrawIndex; // next draw index for this tile + }; + + std::vector<TileScheduleData> fScheduleData; +}; + +class TiledDrawSchedulerFlexible : 
public TiledDrawSchedulerBase { +public: + TiledDrawSchedulerFlexible(int tiles, WorkFunc work) + : TiledDrawSchedulerBase(tiles, work), fScheduleData(tiles) {} + + void signal() final { this->TiledDrawSchedulerBase::signal(); } + void finish() final { this->TiledDrawSchedulerBase::finish(); } + + bool next(int& tileIndex) final { + int failCnt = 0; + while (true) { + TileScheduleData& scheduleData = fScheduleData[tileIndex]; + bool locked = scheduleData.fMutex.try_lock(); + bool processed = false; + + if (locked) { + if (scheduleData.fDrawIndex < fDrawCnt) { + fWork(tileIndex, scheduleData.fDrawIndex++); + processed = true; + } else { + failCnt += fIsFinishing.load(std::memory_order_relaxed); + } + scheduleData.fMutex.unlock(); + } + + if (processed) { + return true; + } else { + if (failCnt >= fTileCnt) { + return false; + } + tileIndex = (tileIndex + 1) % fTileCnt; + } + } + } + +private: + // alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines + struct alignas(MAX_CACHE_LINE) TileScheduleData { + TileScheduleData() : fDrawIndex(0) {} + + int fDrawIndex; // next draw index for this tile + std::mutex fMutex; // the mutex for the thread to acquire + }; + + std::vector<TileScheduleData> fScheduleData; +}; + +class TiledDrawSchedulerBySemaphores : public TiledDrawSchedulerBase { +public: + TiledDrawSchedulerBySemaphores(int tiles, WorkFunc work) + : TiledDrawSchedulerBase(tiles, work), fScheduleData(tiles) {} + + + void signal() final { + this->TiledDrawSchedulerBase::signal(); + signalRoot(); + } + + void finish() final { + this->TiledDrawSchedulerBase::finish(); + signalRoot(); + } + + bool next(int& tileIndex) final { + SkASSERT(tileIndex >= 0 && tileIndex < fTileCnt); + TileScheduleData& scheduleData = fScheduleData[tileIndex]; + while (true) { + scheduleData.fSemaphore.wait(); + int leftChild = (tileIndex + 1) * 2 - 1; + int rightChild = leftChild + 1; + if (leftChild < fTileCnt) { + fScheduleData[leftChild].fSemaphore.signal(); + } + if (rightChild < 
fTileCnt) { + fScheduleData[rightChild].fSemaphore.signal(); + } + + bool isFinishing = fIsFinishing.load(std::memory_order_relaxed); + if (isFinishing && scheduleData.fDrawIndex >= fDrawCnt) { + return false; + } else { + SkASSERT(scheduleData.fDrawIndex < fDrawCnt); + fWork(tileIndex, scheduleData.fDrawIndex++); + return true; + } + } + } + +private: + // alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines + struct alignas(MAX_CACHE_LINE) TileScheduleData { + TileScheduleData() : fDrawIndex(0) {} + + int fDrawIndex; + SkSemaphore fSemaphore; + }; + + void signalRoot() { + SkASSERT(fTileCnt > 0); + fScheduleData[0].fSemaphore.signal(); + } + + std::vector<TileScheduleData> fScheduleData; +}; + +void SkThreadedBMPDevice::startThreads() { + SkASSERT(fThreadFutures.count() == 0); + SkASSERT(fQueueSize == 0); + + TiledDrawScheduler::WorkFunc work = [this](int tileIndex, int drawIndex){ + auto& element = fQueue[drawIndex]; + if (SkIRect::Intersects(fTileBounds[tileIndex], element.fDrawBounds)) { + element.fDrawFn(fTileBounds[tileIndex]); + } + }; + + // using Scheduler = TiledDrawSchedulerBySemaphores; + // using Scheduler = TiledDrawSchedulerBySpinning; + using Scheduler = TiledDrawSchedulerFlexible; + fScheduler.reset(new Scheduler(fTileCnt, work)); + for(int i = 0; i < fThreadCnt; ++i) { + fThreadFutures.push_back(std::async(std::launch::async, [this, i]() { + int tileIndex = i; + while (fScheduler->next(tileIndex)) {} + })); + } +} + +void SkThreadedBMPDevice::finishThreads() { + fScheduler->finish(); + for(auto& future : fThreadFutures) { + future.wait(); + } + fThreadFutures.reset(); + fQueueSize = 0; + fScheduler.reset(nullptr); +} + +SkThreadedBMPDevice::SkThreadedBMPDevice(const SkBitmap& bitmap, int tiles, int threads) : INHERITED(bitmap) - , fThreadCnt(threads) + , fTileCnt(tiles) + , fThreadCnt(threads <= 0 ? tiles : threads) { // Tiling using stripes for now; we'll explore better tiling in the future. 
- int h = (bitmap.height() + fThreadCnt - 1) / SkTMax(fThreadCnt, 1); + int h = (bitmap.height() + fTileCnt - 1) / SkTMax(fTileCnt, 1); int w = bitmap.width(); int top = 0; - for(int tid = 0; tid < fThreadCnt; ++tid, top += h) { - fThreadBounds.push_back(SkIRect::MakeLTRB(0, top, w, top + h)); + for(int tid = 0; tid < fTileCnt; ++tid, top += h) { + fTileBounds.push_back(SkIRect::MakeLTRB(0, top, w, top + h)); } + fQueueSize = 0; + startThreads(); } void SkThreadedBMPDevice::flush() { - SkTaskGroup().batch(fThreadCnt, [this](int i) { - for(auto& element : fQueue) { - if (SkIRect::Intersects(fThreadBounds[i], element.fDrawBounds)) { - element.fDrawFn(fThreadBounds[i]); - } - } - }); - fQueue.reset(); + finishThreads(); + startThreads(); } // Having this captured in lambda seems to be faster than saving this in DrawElement @@ -75,14 +269,16 @@ SkIRect SkThreadedBMPDevice::transformDrawBounds(const SkRect& drawBounds) const #define THREADED_DRAW(drawBounds, actualDrawCall) \ do { \ DrawState ds(this); \ - fQueue.push_back({ \ + SkASSERT(fQueueSize < MAX_QUEUE_SIZE); \ + fQueue[fQueueSize++] = { \ this->transformDrawBounds(drawBounds), \ - [=](const SkIRect& threadBounds) { \ - SkRasterClip threadRC; \ - SkDraw draw = ds.getThreadDraw(threadRC, threadBounds); \ + [=](const SkIRect& tileBounds) { \ + SkRasterClip tileRC; \ + SkDraw draw = ds.getThreadDraw(tileRC, tileBounds); \ draw.actualDrawCall; \ }, \ - }); \ + }; \ + fScheduler->signal(); \ } while (false) static inline SkRect get_fast_bounds(const SkRect& r, const SkPaint& p) { |