diff options
Diffstat (limited to 'src/core/SkThreadedBMPDevice.cpp')
-rw-r--r-- | src/core/SkThreadedBMPDevice.cpp | 232 |
1 file changed, 214 insertions, 18 deletions
diff --git a/src/core/SkThreadedBMPDevice.cpp b/src/core/SkThreadedBMPDevice.cpp index 1cf7fe449a..0e45b9fbf6 100644 --- a/src/core/SkThreadedBMPDevice.cpp +++ b/src/core/SkThreadedBMPDevice.cpp @@ -11,28 +11,222 @@ #include "SkTaskGroup.h" #include "SkVertices.h" -SkThreadedBMPDevice::SkThreadedBMPDevice(const SkBitmap& bitmap, int threads) +#include <mutex> +#include <vector> + +constexpr int MAX_CACHE_LINE = 64; + +// Some basic logics and data structures that are shared across the current experimental schedulers. +class TiledDrawSchedulerBase : public TiledDrawScheduler { +public: + TiledDrawSchedulerBase(int tiles, WorkFunc work) + : fTileCnt(tiles), fIsFinishing(false), fDrawCnt(0), fWork(work) {} + + void signal() override { + fDrawCnt++; + } + void finish() override { + fIsFinishing.store(true, std::memory_order_relaxed); + } + +protected: + const int fTileCnt; + std::atomic<bool> fIsFinishing; + std::atomic<int> fDrawCnt; + WorkFunc fWork; +}; + +class TiledDrawSchedulerBySpinning : public TiledDrawSchedulerBase { +public: + TiledDrawSchedulerBySpinning(int tiles, WorkFunc work) + : TiledDrawSchedulerBase(tiles, work), fScheduleData(tiles) {} + + void signal() final { this->TiledDrawSchedulerBase::signal(); } + void finish() final { this->TiledDrawSchedulerBase::finish(); } + + bool next(int& tileIndex) final { + int& drawIndex = fScheduleData[tileIndex].fDrawIndex; + SkASSERT(drawIndex <= fDrawCnt); + while (true) { + bool isFinishing = fIsFinishing.load(std::memory_order_relaxed); + if (isFinishing && drawIndex >= fDrawCnt) { + return false; + } else if (drawIndex < fDrawCnt) { + fWork(tileIndex, drawIndex++); + return true; + } + } + } + +private: + // alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines + struct alignas(MAX_CACHE_LINE) TileScheduleData { + TileScheduleData() : fDrawIndex(0) {} + + int fDrawIndex; // next draw index for this tile + }; + + std::vector<TileScheduleData> fScheduleData; +}; + +class TiledDrawSchedulerFlexible : 
public TiledDrawSchedulerBase { +public: + TiledDrawSchedulerFlexible(int tiles, WorkFunc work) + : TiledDrawSchedulerBase(tiles, work), fScheduleData(tiles) {} + + void signal() final { this->TiledDrawSchedulerBase::signal(); } + void finish() final { this->TiledDrawSchedulerBase::finish(); } + + bool next(int& tileIndex) final { + int failCnt = 0; + while (true) { + TileScheduleData& scheduleData = fScheduleData[tileIndex]; + bool locked = scheduleData.fMutex.try_lock(); + bool processed = false; + + if (locked) { + if (scheduleData.fDrawIndex < fDrawCnt) { + fWork(tileIndex, scheduleData.fDrawIndex++); + processed = true; + } else { + failCnt += fIsFinishing.load(std::memory_order_relaxed); + } + scheduleData.fMutex.unlock(); + } + + if (processed) { + return true; + } else { + if (failCnt >= fTileCnt) { + return false; + } + tileIndex = (tileIndex + 1) % fTileCnt; + } + } + } + +private: + // alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines + struct alignas(MAX_CACHE_LINE) TileScheduleData { + TileScheduleData() : fDrawIndex(0) {} + + int fDrawIndex; // next draw index for this tile + std::mutex fMutex; // the mutex for the thread to acquire + }; + + std::vector<TileScheduleData> fScheduleData; +}; + +class TiledDrawSchedulerBySemaphores : public TiledDrawSchedulerBase { +public: + TiledDrawSchedulerBySemaphores(int tiles, WorkFunc work) + : TiledDrawSchedulerBase(tiles, work), fScheduleData(tiles) {} + + + void signal() final { + this->TiledDrawSchedulerBase::signal(); + signalRoot(); + } + + void finish() final { + this->TiledDrawSchedulerBase::finish(); + signalRoot(); + } + + bool next(int& tileIndex) final { + SkASSERT(tileIndex >= 0 && tileIndex < fTileCnt); + TileScheduleData& scheduleData = fScheduleData[tileIndex]; + while (true) { + scheduleData.fSemaphore.wait(); + int leftChild = (tileIndex + 1) * 2 - 1; + int rightChild = leftChild + 1; + if (leftChild < fTileCnt) { + fScheduleData[leftChild].fSemaphore.signal(); + } + if (rightChild < 
fTileCnt) { + fScheduleData[rightChild].fSemaphore.signal(); + } + + bool isFinishing = fIsFinishing.load(std::memory_order_relaxed); + if (isFinishing && scheduleData.fDrawIndex >= fDrawCnt) { + return false; + } else { + SkASSERT(scheduleData.fDrawIndex < fDrawCnt); + fWork(tileIndex, scheduleData.fDrawIndex++); + return true; + } + } + } + +private: + // alignas(MAX_CACHE_LINE) to avoid false sharing by cache lines + struct alignas(MAX_CACHE_LINE) TileScheduleData { + TileScheduleData() : fDrawIndex(0) {} + + int fDrawIndex; + SkSemaphore fSemaphore; + }; + + void signalRoot() { + SkASSERT(fTileCnt > 0); + fScheduleData[0].fSemaphore.signal(); + } + + std::vector<TileScheduleData> fScheduleData; +}; + +void SkThreadedBMPDevice::startThreads() { + SkASSERT(fThreadFutures.count() == 0); + SkASSERT(fQueueSize == 0); + + TiledDrawScheduler::WorkFunc work = [this](int tileIndex, int drawIndex){ + auto& element = fQueue[drawIndex]; + if (SkIRect::Intersects(fTileBounds[tileIndex], element.fDrawBounds)) { + element.fDrawFn(fTileBounds[tileIndex]); + } + }; + + // using Scheduler = TiledDrawSchedulerBySemaphores; + // using Scheduler = TiledDrawSchedulerBySpinning; + using Scheduler = TiledDrawSchedulerFlexible; + fScheduler.reset(new Scheduler(fTileCnt, work)); + for(int i = 0; i < fThreadCnt; ++i) { + fThreadFutures.push_back(std::async(std::launch::async, [this, i]() { + int tileIndex = i; + while (fScheduler->next(tileIndex)) {} + })); + } +} + +void SkThreadedBMPDevice::finishThreads() { + fScheduler->finish(); + for(auto& future : fThreadFutures) { + future.wait(); + } + fThreadFutures.reset(); + fQueueSize = 0; + fScheduler.reset(nullptr); +} + +SkThreadedBMPDevice::SkThreadedBMPDevice(const SkBitmap& bitmap, int tiles, int threads) : INHERITED(bitmap) - , fThreadCnt(threads) + , fTileCnt(tiles) + , fThreadCnt(threads <= 0 ? tiles : threads) { // Tiling using stripes for now; we'll explore better tiling in the future. 
- int h = (bitmap.height() + fThreadCnt - 1) / SkTMax(fThreadCnt, 1); + int h = (bitmap.height() + fTileCnt - 1) / SkTMax(fTileCnt, 1); int w = bitmap.width(); int top = 0; - for(int tid = 0; tid < fThreadCnt; ++tid, top += h) { - fThreadBounds.push_back(SkIRect::MakeLTRB(0, top, w, top + h)); + for(int tid = 0; tid < fTileCnt; ++tid, top += h) { + fTileBounds.push_back(SkIRect::MakeLTRB(0, top, w, top + h)); } + fQueueSize = 0; + startThreads(); } void SkThreadedBMPDevice::flush() { - SkTaskGroup().batch(fThreadCnt, [this](int i) { - for(auto& element : fQueue) { - if (SkIRect::Intersects(fThreadBounds[i], element.fDrawBounds)) { - element.fDrawFn(fThreadBounds[i]); - } - } - }); - fQueue.reset(); + finishThreads(); + startThreads(); } // Having this captured in lambda seems to be faster than saving this in DrawElement @@ -75,14 +269,16 @@ SkIRect SkThreadedBMPDevice::transformDrawBounds(const SkRect& drawBounds) const #define THREADED_DRAW(drawBounds, actualDrawCall) \ do { \ DrawState ds(this); \ - fQueue.push_back({ \ + SkASSERT(fQueueSize < MAX_QUEUE_SIZE); \ + fQueue[fQueueSize++] = { \ this->transformDrawBounds(drawBounds), \ - [=](const SkIRect& threadBounds) { \ - SkRasterClip threadRC; \ - SkDraw draw = ds.getThreadDraw(threadRC, threadBounds); \ + [=](const SkIRect& tileBounds) { \ + SkRasterClip tileRC; \ + SkDraw draw = ds.getThreadDraw(tileRC, tileBounds); \ draw.actualDrawCall; \ }, \ - }); \ + }; \ + fScheduler->signal(); \ } while (false) static inline SkRect get_fast_bounds(const SkRect& r, const SkPaint& p) { |