-rw-r--r--  bench/SkRasterPipelineBench.cpp       | 135
-rw-r--r--  src/core/SkRasterPipeline.cpp         |  37
-rw-r--r--  src/core/SkRasterPipeline.h           | 113
-rw-r--r--  src/core/SkRasterPipelineBlitter.cpp  | 173
-rw-r--r--  src/core/SkXfermode.cpp               |   8
-rw-r--r--  src/effects/SkArithmeticMode.cpp      |   7
-rw-r--r--  tests/SkRasterPipelineTest.cpp        |  44
7 files changed, 245 insertions(+), 272 deletions(-)
diff --git a/bench/SkRasterPipelineBench.cpp b/bench/SkRasterPipelineBench.cpp
index 5ef64ce603..aa3e355e45 100644
--- a/bench/SkRasterPipelineBench.cpp
+++ b/bench/SkRasterPipelineBench.cpp
@@ -21,11 +21,28 @@ static uint8_t mask[N];
 // - load srgb dst
 // - src = srcover(dst, src)
 // - store src back as srgb
-// Every stage except for srcover interacts with memory, and so will need _tail variants.
 
 SK_RASTER_STAGE(load_s_srgb) {
     auto ptr = (const uint32_t*)ctx + x;
 
+    if (tail) {
+        float rs[] = {0,0,0,0},
+              gs[] = {0,0,0,0},
+              bs[] = {0,0,0,0},
+              as[] = {0,0,0,0};
+        for (size_t i = 0; i < (tail&3); i++) {
+            rs[i] = sk_linear_from_srgb[(ptr[i] >>  0) & 0xff];
+            gs[i] = sk_linear_from_srgb[(ptr[i] >>  8) & 0xff];
+            bs[i] = sk_linear_from_srgb[(ptr[i] >> 16) & 0xff];
+            as[i] = (ptr[i] >> 24) * (1/255.0f);
+        }
+        r = Sk4f::Load(rs);
+        g = Sk4f::Load(gs);
+        b = Sk4f::Load(bs);
+        a = Sk4f::Load(as);
+        return;
+    }
+
     r = Sk4f{ sk_linear_from_srgb[(ptr[0] >>  0) & 0xff],
               sk_linear_from_srgb[(ptr[1] >>  0) & 0xff],
               sk_linear_from_srgb[(ptr[2] >>  0) & 0xff],
@@ -40,22 +57,30 @@ SK_RASTER_STAGE(load_s_srgb) {
               sk_linear_from_srgb[(ptr[1] >> 16) & 0xff],
               sk_linear_from_srgb[(ptr[2] >> 16) & 0xff],
               sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] };
-
     a = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);
 }
 
-SK_RASTER_STAGE(load_s_srgb_tail) {
-    auto ptr = (const uint32_t*)ctx + x;
-
-    r = Sk4f{ sk_linear_from_srgb[(*ptr >>  0) & 0xff], 0,0,0 };
-    g = Sk4f{ sk_linear_from_srgb[(*ptr >>  8) & 0xff], 0,0,0 };
-    b = Sk4f{ sk_linear_from_srgb[(*ptr >> 16) & 0xff], 0,0,0 };
-    a = Sk4f{ (*ptr >> 24) * (1/255.0f), 0,0,0 };
-}
-
 SK_RASTER_STAGE(load_d_srgb) {
     auto ptr = (const uint32_t*)ctx + x;
 
+    if (tail) {
+        float rs[] = {0,0,0,0},
+              gs[] = {0,0,0,0},
+              bs[] = {0,0,0,0},
+              as[] = {0,0,0,0};
+        for (size_t i = 0; i < (tail&3); i++) {
+            rs[i] = sk_linear_from_srgb[(ptr[i] >>  0) & 0xff];
+            gs[i] = sk_linear_from_srgb[(ptr[i] >>  8) & 0xff];
+            bs[i] = sk_linear_from_srgb[(ptr[i] >> 16) & 0xff];
+            as[i] = (ptr[i] >> 24) * (1/255.0f);
+        }
+        dr = Sk4f::Load(rs);
+        dg = Sk4f::Load(gs);
+        db = Sk4f::Load(bs);
+        da = Sk4f::Load(as);
+        return;
+    }
+
     dr = Sk4f{ sk_linear_from_srgb[(ptr[0] >>  0) & 0xff],
               sk_linear_from_srgb[(ptr[1] >>  0) & 0xff],
               sk_linear_from_srgb[(ptr[2] >>  0) & 0xff],
@@ -74,29 +99,24 @@ SK_RASTER_STAGE(load_d_srgb) {
     da = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);
 }
 
-SK_RASTER_STAGE(load_d_srgb_tail) {
-    auto ptr = (const uint32_t*)ctx + x;
-
-    dr = Sk4f{ sk_linear_from_srgb[(*ptr >>  0) & 0xff], 0,0,0 };
-    dg = Sk4f{ sk_linear_from_srgb[(*ptr >>  8) & 0xff], 0,0,0 };
-    db = Sk4f{ sk_linear_from_srgb[(*ptr >> 16) & 0xff], 0,0,0 };
-    da = Sk4f{ (*ptr >> 24) * (1/255.0f), 0,0,0 };
-}
-
 SK_RASTER_STAGE(scale_u8) {
     auto ptr = (const uint8_t*)ctx + x;
 
-    auto c = SkNx_cast<float>(Sk4b::Load(ptr)) * (1/255.0f);
-    r *= c;
-    g *= c;
-    b *= c;
-    a *= c;
-}
+    Sk4b cov;
 
-SK_RASTER_STAGE(scale_u8_tail) {
-    auto ptr = (const uint8_t*)ctx + x;
+    if (tail) {
+        uint8_t cs[] = {0,0,0,0};
+        switch (tail&3) {
+            case 3: cs[2] = ptr[2];
+            case 2: cs[1] = ptr[1];
+            case 1: cs[0] = ptr[0];
+        }
+        cov = Sk4b::Load(cs);
+    } else {
+        cov = Sk4b::Load(ptr);
+    }
 
-    auto c = *ptr * (1/255.0f);
+    auto c = SkNx_cast<float>(cov) * (1/255.0f);
     r *= c;
     g *= c;
     b *= c;
@@ -114,19 +134,24 @@ SK_RASTER_STAGE(srcover) {
 SK_RASTER_STAGE(store_srgb) {
     auto ptr = (uint32_t*)ctx + x;
 
+    uint32_t* dst = nullptr;
+    uint32_t stack[4];
+
+    if (tail) {
+        dst = ptr;
+        ptr = stack;
+    }
+
     ( sk_linear_to_srgb(r)
     | sk_linear_to_srgb(g) <<  8
     | sk_linear_to_srgb(b) << 16
     | Sk4f_round(255.0f*a) << 24).store(ptr);
-}
-
-SK_RASTER_STAGE(store_srgb_tail) {
-    auto ptr = (uint32_t*)ctx + x;
-
-    Sk4i rgba = sk_linear_to_srgb({r[0], g[0], b[0], 0});
-    rgba = {rgba[0], rgba[1], rgba[2], (int)(255.0f * a[0] + 0.5f)};
-    SkNx_cast<uint8_t>(rgba).store(ptr);
+
+    switch (tail&3) {
+        case 3: dst[2] = ptr[2];
+        case 2: dst[1] = ptr[1];
+        case 1: dst[0] = ptr[0];
+    }
 }
 
 class SkRasterPipelineBench : public Benchmark {
@@ -147,35 +172,31 @@ public:
         Sk4f r,g,b,a, dr,dg,db,da;
         size_t x = 0, n = N;
         while (n >= 4) {
-            load_s_srgb(src    , x, r,g,b,a, dr,dg,db,da);
-            scale_u8   (mask   , x, r,g,b,a, dr,dg,da,da);
-            load_d_srgb(dst    , x, r,g,b,a, dr,dg,da,da);
-            srcover    (nullptr, x, r,g,b,a, dr,dg,da,da);
-            store_srgb (dst    , x, r,g,b,a, dr,dg,da,da);
+            load_s_srgb(src    , x,0, r,g,b,a, dr,dg,db,da);
+            scale_u8   (mask   , x,0, r,g,b,a, dr,dg,da,da);
+            load_d_srgb(dst    , x,0, r,g,b,a, dr,dg,da,da);
+            srcover    (nullptr, x,0, r,g,b,a, dr,dg,da,da);
+            store_srgb (dst    , x,0, r,g,b,a, dr,dg,da,da);
 
             x += 4;
            n -= 4;
         }
-        while (n > 0) {
-            load_s_srgb_tail(src    , x, r,g,b,a, dr,dg,db,da);
-            scale_u8_tail   (mask   , x, r,g,b,a, dr,dg,da,da);
-            load_d_srgb_tail(dst    , x, r,g,b,a, dr,dg,da,da);
-            srcover         (nullptr, x, r,g,b,a, dr,dg,da,da);
-            store_srgb_tail (dst    , x, r,g,b,a, dr,dg,da,da);
-
-            x += 1;
-            n -= 1;
+        if (n > 0) {
+            load_s_srgb(src    , x,n, r,g,b,a, dr,dg,db,da);
+            scale_u8   (mask   , x,n, r,g,b,a, dr,dg,da,da);
+            load_d_srgb(dst    , x,n, r,g,b,a, dr,dg,da,da);
+            srcover    (nullptr, x,n, r,g,b,a, dr,dg,da,da);
+            store_srgb (dst    , x,n, r,g,b,a, dr,dg,da,da);
        }
     }
 
     void runPipeline() {
         SkRasterPipeline p;
-        p.append<load_s_srgb, load_s_srgb_tail>( src);
-        p.append<   scale_u8,    scale_u8_tail>(mask);
-        p.append<load_d_srgb, load_d_srgb_tail>( dst);
-        p.append<srcover>();
-        p.last<  store_srgb,  store_srgb_tail>( dst);
-
+        p.append<load_s_srgb>(src);
+        p.append<   scale_u8>(mask);
+        p.append<load_d_srgb>(dst);
+        p.append<    srcover>();
+        p.last  < store_srgb>(dst);
 
         p.run(N);
     }
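Every stage above now takes the same tail argument: a load zero-pads the lanes past the end of the row, and a store writes through a scratch buffer or a fallthrough switch so it never touches memory past x+tail. A minimal standalone sketch of that load/store pattern, with plain float[4] arrays standing in for Sk4f (the names load_tail4/store_tail4 are made up for illustration, not Skia API):

    #include <cstddef>

    // Load 4 floats, zero-padding the lanes past the end when tail is 1..3.
    static void load_tail4(const float* src, size_t tail, float dst[4]) {
        for (int i = 0; i < 4; i++) { dst[i] = 0.0f; }
        size_t n = tail ? tail : 4;
        for (size_t i = 0; i < n; i++) { dst[i] = src[i]; }
    }

    // Store 4 floats, or only the bottom 1..3 lanes when tail is 1..3.
    // tail==0 means "all four": case 0 writes the top lane and then falls
    // through the remaining cases, exactly like store_srgb above.
    static void store_tail4(const float src[4], size_t tail, float* dst) {
        switch (tail & 3) {
            case 0: dst[3] = src[3];  // deliberate fallthrough throughout
            case 3: dst[2] = src[2];
            case 2: dst[1] = src[1];
            case 1: dst[0] = src[0];
        }
    }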
diff --git a/src/core/SkRasterPipeline.cpp b/src/core/SkRasterPipeline.cpp
index c50383af01..71c993dfb8 100644
--- a/src/core/SkRasterPipeline.cpp
+++ b/src/core/SkRasterPipeline.cpp
@@ -9,29 +9,30 @@
 
 SkRasterPipeline::SkRasterPipeline() {}
 
-void SkRasterPipeline::append(SkRasterPipeline::Fn body_fn, const void* body_ctx,
-                              SkRasterPipeline::Fn tail_fn, const void* tail_ctx) {
+void SkRasterPipeline::append(SkRasterPipeline::Fn body,
+                              SkRasterPipeline::Fn tail,
+                              const void* ctx) {
     // Each stage holds its own context and the next function to call.
     // So the pipeline itself has to hold onto the first function that starts the pipeline.
-    (fBody.empty() ? fBodyStart : fBody.back().fNext) = body_fn;
-    (fTail.empty() ? fTailStart : fTail.back().fNext) = tail_fn;
+    (fBody.empty() ? fBodyStart : fBody.back().fNext) = body;
+    (fTail.empty() ? fTailStart : fTail.back().fNext) = tail;
 
     // Each last stage starts with its next function set to JustReturn as a safety net.
     // It'll be overwritten by the next call to append().
-    fBody.push_back({ &JustReturn, const_cast<void*>(body_ctx) });
-    fTail.push_back({ &JustReturn, const_cast<void*>(tail_ctx) });
+    fBody.push_back({ &JustReturn, const_cast<void*>(ctx) });
+    fTail.push_back({ &JustReturn, const_cast<void*>(ctx) });
 }
 
 void SkRasterPipeline::extend(const SkRasterPipeline& src) {
     SkASSERT(src.fBody.count() == src.fTail.count());
 
-    Fn body_fn = src.fBodyStart,
-       tail_fn = src.fTailStart;
+    Fn body = src.fBodyStart,
+       tail = src.fTailStart;
     for (int i = 0; i < src.fBody.count(); i++) {
-        this->append(body_fn, src.fBody[i].fCtx,
-                     tail_fn, src.fTail[i].fCtx);
-        body_fn = src.fBody[i].fNext;
-        tail_fn = src.fTail[i].fNext;
+        SkASSERT(src.fBody[i].fCtx == src.fTail[i].fCtx);
+        this->append(body, tail, src.fBody[i].fCtx);
+        body = src.fBody[i].fNext;
+        tail = src.fTail[i].fNext;
     }
 }
 
@@ -40,16 +41,14 @@ void SkRasterPipeline::run(size_t x, size_t n) {
     Sk4f v;
 
     while (n >= 4) {
-        fBodyStart(fBody.begin(), x, v,v,v,v, v,v,v,v);
+        fBodyStart(fBody.begin(), x,0, v,v,v,v, v,v,v,v);
         x += 4;
         n -= 4;
     }
-    while (n > 0) {
-        fTailStart(fTail.begin(), x, v,v,v,v, v,v,v,v);
-        x += 1;
-        n -= 1;
+    if (n > 0) {
+        fTailStart(fTail.begin(), x,n, v,v,v,v, v,v,v,v);
     }
 }
 
-void SK_VECTORCALL SkRasterPipeline::JustReturn(Stage*, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
-                                                Sk4f,Sk4f,Sk4f,Sk4f) {}
+void SK_VECTORCALL SkRasterPipeline::JustReturn(Stage*, size_t, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
+                                                Sk4f,Sk4f,Sk4f,Sk4f) {}
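The append() bookkeeping is unchanged in shape: the new stage's function pointer is wired into the previous stage's fNext (or into fBodyStart/fTailStart for the first stage), and the new stage itself is parked on JustReturn until a later append() overwrites it. The same mechanism in miniature, with the vector arguments stripped out (MiniStage and MiniPipeline are hypothetical, not Skia types):

    #include <vector>

    struct MiniStage {
        void (*fNext)(MiniStage*);   // function of the *following* stage
        void* fCtx;
    };

    static void just_return(MiniStage*) {}   // safety net, like JustReturn

    struct MiniPipeline {
        void (*fStart)(MiniStage*) = just_return;
        std::vector<MiniStage> fStages;

        void append(void (*fn)(MiniStage*), void* ctx) {
            // Wire fn into whoever currently ends the pipeline...
            (fStages.empty() ? fStart : fStages.back().fNext) = fn;
            // ...then park the new last stage on the no-op safety net.
            fStages.push_back({just_return, ctx});
        }
    };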
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index 03fab998e7..7e934f1731 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -26,13 +26,16 @@
  *    are designed to maximize the amount of data we can pass along the pipeline cheaply.
  *    On many machines all arguments stay in registers the entire time.
  *
- * The meaning of the arguments to Fn are sometimes fixed...
+ * The meaning of the arguments to Fn are sometimes fixed:
  *    - The Stage* always represents the current stage, mainly providing access to ctx().
- *    - The size_t is always the destination x coordinate. If you need y, put it in your context.
+ *    - The first size_t is always the destination x coordinate.
+ *      (If you need y, put it in your context.)
+ *    - The second size_t is always tail: 0 when working on a full 4-pixel slab,
+ *      or 1..3 when using only the bottom 1..3 lanes of each register.
  *    - By the time the shader's done, the first four vectors should hold source red,
  *      green, blue, and alpha, up to 4 pixels' worth each.
 *
- * ...and sometimes flexible:
+ * Sometimes arguments are flexible:
  *    - In the shader, the first four vectors can be used for anything, e.g. sample coordinates.
  *    - The last four vectors are scratch registers that can be used to communicate between
  *      stages; transfer modes use these to hold the original destination pixel components.
@@ -43,7 +46,7 @@
 *    1) call st->next() with its mutated arguments, chaining to the next stage of the pipeline; or
 *    2) return, indicating the pipeline is complete for these pixels.
 *
- * Some obvious stages that typically return are those that write a color to a destination pointer,
+ * Some stages that typically return are those that write a color to a destination pointer,
 * but any stage can short-circuit the rest of the pipeline by returning instead of calling next().
 *
 * Most simple pipeline stages can use the SK_RASTER_STAGE macro to define a static EasyFn,
@@ -52,26 +55,29 @@
 *    - instead of manually calling a next() function, just modify registers in place.
 *
 * To add an EasyFn stage to the pipeline, call append<fn>() instead of append(&fn).
- * For the last stage of a pipeline, it's a slight performance benefit to call last<fn>().
+ * It's a slight performance benefit to call last<fn>() for the last stage of a pipeline.
 */
 
+// TODO: There may be a better place to stuff tail, e.g. in the bottom alignment bits of
+// the Stage*.  This mostly matters on 64-bit Windows where every register is precious.
+
 class SkRasterPipeline {
 public:
     struct Stage;
-    using Fn = void(SK_VECTORCALL *)(Stage*, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
-                                     Sk4f,Sk4f,Sk4f,Sk4f);
-    using EasyFn = void(void*, size_t, Sk4f&, Sk4f&, Sk4f&, Sk4f&,
-                        Sk4f&, Sk4f&, Sk4f&, Sk4f&);
+    using Fn = void(SK_VECTORCALL *)(Stage*, size_t, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
+                                     Sk4f,Sk4f,Sk4f,Sk4f);
+    using EasyFn = void(void*, size_t, size_t, Sk4f&, Sk4f&, Sk4f&, Sk4f&,
+                        Sk4f&, Sk4f&, Sk4f&, Sk4f&);
 
     struct Stage {
         template <typename T>
         T ctx() { return static_cast<T>(fCtx); }
 
-        void SK_VECTORCALL next(size_t x, Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
-                                          Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
+        void SK_VECTORCALL next(size_t x, size_t tail, Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3,
+                                                       Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) {
             // Stages are logically a pipeline, and physically are contiguous in an array.
             // To get to the next stage, we just increment our pointer to the next array element.
-            fNext(this+1, x, v0,v1,v2,v3, v4,v5,v6,v7);
+            fNext(this+1, x,tail, v0,v1,v2,v3, v4,v5,v6,v7);
         }
 
         // It makes next() a good bit cheaper if we hold the next function to call here,
@@ -84,50 +90,26 @@ public:
     SkRasterPipeline();
 
     // Run the pipeline constructed with append(), walking x through [x,x+n),
-    // generally in 4 pixel steps, but sometimes 1 pixel at a time.
+    // generally in 4-pixel steps, with perhaps one jagged tail step.
     void run(size_t x, size_t n);
     void run(size_t n) { this->run(0, n); }
 
-    // Use this append() if your stage is sensitive to the number of pixels you're working with:
-    //    - body will always be called for a full 4 pixels
-    //    - tail will always be called for a single pixel
-    // Typically this is only an essential distintion for stages that read or write memory.
-    void append(Fn body, const void* body_ctx,
-                Fn tail, const void* tail_ctx);
-
-    // Most stages don't actually care if they're working on 4 or 1 pixel.
-    void append(Fn fn, const void* ctx = nullptr) {
-        this->append(fn, ctx, fn, ctx);
-    }
+    // body() will only be called with tail=0, indicating it always works on a full 4 pixels.
+    // tail() will only be called with tail=1..3 to handle the jagged end of n%4 pixels.
+    void append(Fn body, Fn tail, const void* ctx = nullptr);
+    void append(Fn fn, const void* ctx = nullptr) { this->append(fn, fn, ctx); }
 
-    // Most 4 pixel or 1 pixel variants share the same context pointer.
-    void append(Fn body, Fn tail, const void* ctx = nullptr) {
-        this->append(body, ctx, tail, ctx);
-    }
-
-
-    // Versions of append that can be used with static EasyFns (see SK_RASTER_STAGE).
-    template <EasyFn body, EasyFn tail>
-    void append(const void* body_ctx, const void* tail_ctx) {
-        this->append(Easy<body>, body_ctx,
-                     Easy<tail>, tail_ctx);
-    }
-    template <EasyFn body, EasyFn tail>
-    void last(const void* body_ctx, const void* tail_ctx) {
-        this->append(Last<body>, body_ctx,
-                     Last<tail>, tail_ctx);
+    // Version of append that can be used with static EasyFn (see SK_RASTER_STAGE).
+    template <EasyFn fn>
+    void append(const void* ctx = nullptr) {
+        this->append(Body<fn,true>, Tail<fn,true>, ctx);
     }
 
+    // If this is the last stage of the pipeline, last() is a bit faster than append().
     template <EasyFn fn>
-    void append(const void* ctx = nullptr) { this->append<fn, fn>(ctx, ctx); }
-    template <EasyFn fn>
-    void last(const void* ctx = nullptr) { this->last<fn, fn>(ctx, ctx); }
-
-    template <EasyFn body, EasyFn tail>
-    void append(const void* ctx = nullptr) { this->append<body, tail>(ctx, ctx); }
-    template <EasyFn body, EasyFn tail>
-    void last(const void* ctx = nullptr) { this->last<body, tail>(ctx, ctx); }
-
+    void last(const void* ctx = nullptr) {
+        this->append(Body<fn,false>, Tail<fn,false>, ctx);
+    }
 
     // Append all stages to this pipeline.
     void extend(const SkRasterPipeline&);
@@ -138,22 +120,31 @@ private:
     // This no-op default makes fBodyStart and fTailStart unconditionally safe to call,
     // and is always the last stage's fNext as a sort of safety net to make sure even a
     // buggy pipeline can't walk off its own end.
-    static void SK_VECTORCALL JustReturn(Stage*, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
-                                         Sk4f,Sk4f,Sk4f,Sk4f);
+    static void SK_VECTORCALL JustReturn(Stage*, size_t, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
+                                         Sk4f,Sk4f,Sk4f,Sk4f);
 
-    template <EasyFn kernel>
-    static void SK_VECTORCALL Easy(SkRasterPipeline::Stage* st, size_t x,
+    template <EasyFn kernel, bool kCallNext>
+    static void SK_VECTORCALL Body(SkRasterPipeline::Stage* st, size_t x, size_t tail,
                                    Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
                                    Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-        kernel(st->ctx<void*>(), x, r,g,b,a, dr,dg,db,da);
-        st->next(x, r,g,b,a, dr,dg,db,da);
+        // Passing 0 lets the optimizer completely drop any "if (tail) {...}" code in kernel.
+        kernel(st->ctx<void*>(), x,0, r,g,b,a, dr,dg,db,da);
+        if (kCallNext) {
+            st->next(x,tail, r,g,b,a, dr,dg,db,da);  // It's faster to pass tail here than 0.
+        }
     }
 
-    template <EasyFn kernel>
-    static void SK_VECTORCALL Last(SkRasterPipeline::Stage* st, size_t x,
+    template <EasyFn kernel, bool kCallNext>
+    static void SK_VECTORCALL Tail(SkRasterPipeline::Stage* st, size_t x, size_t tail,
                                    Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
                                    Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
-        kernel(st->ctx<void*>(), x, r,g,b,a, dr,dg,db,da);
+    #if defined(__clang__)
+        __builtin_assume(tail > 0);  // This flourish lets Clang compile away any tail==0 code.
+    #endif
+        kernel(st->ctx<void*>(), x,tail, r,g,b,a, dr,dg,db,da);
+        if (kCallNext) {
+            st->next(x,tail, r,g,b,a, dr,dg,db,da);
+        }
     }
 
     Stages fBody,
@@ -164,9 +155,9 @@ private:
 
 // These are always static, and we _really_ want them to inline.
 // If you find yourself wanting a non-inline stage, write a SkRasterPipeline::Fn directly.
-#define SK_RASTER_STAGE(name)                                                  \
-    static SK_ALWAYS_INLINE void name(void* ctx, size_t x,                     \
-                                      Sk4f&  r, Sk4f&  g, Sk4f&  b, Sk4f&  a,  \
+#define SK_RASTER_STAGE(name)                                                  \
+    static SK_ALWAYS_INLINE void name(void* ctx, size_t x, size_t tail,        \
+                                      Sk4f&  r, Sk4f&  g, Sk4f&  b, Sk4f&  a,  \
                                       Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da)
 
 #endif//SkRasterPipeline_DEFINED
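The Body/Tail pair exists so one EasyFn kernel can be specialized two ways: Body hands the kernel a literal 0, so after inlining every if (tail) branch folds away, while Tail's __builtin_assume(tail > 0) lets Clang discard the full-slab path instead. A reduced illustration of the same effect outside Skia (all names here are invented):

    #include <cstddef>

    static inline void kernel(float* dst, const float* src, size_t tail) {
        if (tail) {
            // jagged path: touch only the first tail lanes
            for (size_t i = 0; i < tail; i++) { dst[i] = src[i]; }
        } else {
            // full path: touch all 4 lanes
            for (int i = 0; i < 4; i++) { dst[i] = src[i]; }
        }
    }

    static void body(float* dst, const float* src) {
        kernel(dst, src, 0);   // literal 0: after inlining, the jagged path is dead code
    }

    static void tail_fn(float* dst, const float* src, size_t tail) {
    #if defined(__clang__)
        __builtin_assume(tail > 0);  // likewise, the full path is dead code here
    #endif
        kernel(dst, src, tail);
    }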
diff --git a/src/core/SkRasterPipelineBlitter.cpp b/src/core/SkRasterPipelineBlitter.cpp
index e407d013f5..d618e8b961 100644
--- a/src/core/SkRasterPipelineBlitter.cpp
+++ b/src/core/SkRasterPipelineBlitter.cpp
@@ -101,22 +101,29 @@ SK_RASTER_STAGE(lerp_constant_float) {
     a = lerp(da, a, c);
 }
 
-// s' = d(1-c) + sc, 4 pixels at a time for 8-bit coverage.
-SK_RASTER_STAGE(lerp_a8) {
-    auto ptr = (const uint8_t*)ctx + x;
-    Sk4f c = SkNx_cast<float>(Sk4b::Load(ptr)) * (1/255.0f);
+template <typename T>
+static SkNx<4,T> load_tail(size_t tail, const T* src) {
+    if (tail) {
+        return SkNx<4,T>(src[0], (tail>1 ? src[1] : 0), (tail>2 ? src[2] : 0), 0);
+    }
+    return SkNx<4,T>::Load(src);
+}
 
-    r = lerp(dr, r, c);
-    g = lerp(dg, g, c);
-    b = lerp(db, b, c);
-    a = lerp(da, a, c);
+template <typename T>
+static void store_tail(size_t tail, const SkNx<4,T>& v, T* dst) {
+    switch(tail) {
+        case 0: return v.store(dst);
+        case 3: dst[2] = v[2];
+        case 2: dst[1] = v[1];
+        case 1: dst[0] = v[0];
+    }
 }
 
-// Tail variant of lerp_a8() handling 1 pixel at a time.
-SK_RASTER_STAGE(lerp_a8_1) {
+// s' = d(1-c) + sc for 8-bit c.
+SK_RASTER_STAGE(lerp_a8) {
     auto ptr = (const uint8_t*)ctx + x;
-    Sk4f c = *ptr * (1/255.0f);
+    Sk4f c = SkNx_cast<float>(load_tail(tail, ptr)) * (1/255.0f);
 
     r = lerp(dr, r, c);
     g = lerp(dg, g, c);
     b = lerp(db, b, c);
     a = lerp(da, a, c);
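load_tail() and store_tail() centralize the jagged-edge handling each stage used to duplicate in a _1 twin. As a hypothetical illustration of how the two helpers compose (this stage is not part of the change, and it assumes SkNx_cast can convert the Sk4f back to 8-bit lanes):

    // Hypothetical: halve up to 4 bytes of coverage in place, honoring tail.
    SK_RASTER_STAGE(halve_a8) {
        auto ptr = (uint8_t*)ctx + x;
        Sk4f c = SkNx_cast<float>(load_tail(tail, (const uint8_t*)ptr)) * 0.5f;
        store_tail(tail, SkNx_cast<uint8_t>(c), ptr);  // writes only in-range lanes
    }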
@@ -137,11 +144,11 @@ static Sk4h to_565(const Sk4f& r, const Sk4f& g, const Sk4f& b) {
          | Sk4f_round(b * SK_B16_MASK) << SK_B16_SHIFT);
 }
 
-// s' = d(1-c) + sc, 4 pixels at a time for 565 coverage.
+// s' = d(1-c) + sc for 565 c.
 SK_RASTER_STAGE(lerp_lcd16) {
     auto ptr = (const uint16_t*)ctx + x;
     Sk4f cr, cg, cb;
-    from_565(Sk4h::Load(ptr), &cr, &cg, &cb);
+    from_565(load_tail(tail, ptr), &cr, &cg, &cb);
 
     r = lerp(dr, r, cr);
     g = lerp(dg, g, cg);
@@ -149,89 +156,74 @@ SK_RASTER_STAGE(lerp_lcd16) {
     a = 1.0f;
 }
 
-// Tail variant of lerp_lcd16() handling 1 pixel at a time.
-SK_RASTER_STAGE(lerp_lcd16_1) {
-    auto ptr = (const uint16_t*)ctx + x;
-    Sk4f cr, cg, cb;
-    from_565({*ptr,0,0,0}, &cr, &cg, &cb);
-
-    r = lerp(dr, r, cr);
-    g = lerp(dg, g, cg);
-    b = lerp(db, b, cb);
-    a = 1.0f;
-}
-
-// Load 4 565 dst pixels.
 SK_RASTER_STAGE(load_d_565) {
     auto ptr = (const uint16_t*)ctx + x;
-
-    from_565(Sk4h::Load(ptr), &dr,&dg,&db);
-    da = 1.0f;
-}
-
-// Load 1 565 dst pixel.
-SK_RASTER_STAGE(load_d_565_1) {
-    auto ptr = (const uint16_t*)ctx + x;
-
-    from_565({*ptr,0,0,0}, &dr,&dg,&db);
+    from_565(load_tail(tail, ptr), &dr,&dg,&db);
     da = 1.0f;
 }
 
-// Store 4 565 pixels.
 SK_RASTER_STAGE(store_565) {
     auto ptr = (uint16_t*)ctx + x;
-    to_565(r,g,b).store(ptr);
+    store_tail(tail, to_565(r,g,b), ptr);
 }
 
-// Store 1 565 pixel.
-SK_RASTER_STAGE(store_565_1) {
-    auto ptr = (uint16_t*)ctx + x;
-    *ptr = to_565(r,g,b)[0];
-}
-
-// Load 4 F16 pixels.
 SK_RASTER_STAGE(load_d_f16) {
     auto ptr = (const uint64_t*)ctx + x;
 
+    if (tail) {
+        auto p0 =          SkHalfToFloat_finite_ftz(ptr[0])          ,
+             p1 = tail>1 ? SkHalfToFloat_finite_ftz(ptr[1]) : Sk4f{0},
+             p2 = tail>2 ? SkHalfToFloat_finite_ftz(ptr[2]) : Sk4f{0};
+        dr = { p0[0],p1[0],p2[0],0 };
+        dg = { p0[1],p1[1],p2[1],0 };
+        db = { p0[2],p1[2],p2[2],0 };
+        da = { p0[3],p1[3],p2[3],0 };
+        return;
+    }
+
     Sk4h rh, gh, bh, ah;
     Sk4h_load4(ptr, &rh, &gh, &bh, &ah);
-
     dr = SkHalfToFloat_finite_ftz(rh);
     dg = SkHalfToFloat_finite_ftz(gh);
     db = SkHalfToFloat_finite_ftz(bh);
     da = SkHalfToFloat_finite_ftz(ah);
 }
 
-// Load 1 F16 pixel.
-SK_RASTER_STAGE(load_d_f16_1) {
-    auto ptr = (const uint64_t*)ctx + x;
-
-    auto p0 = SkHalfToFloat_finite_ftz(ptr[0]);
-    dr = { p0[0],0,0,0 };
-    dg = { p0[1],0,0,0 };
-    db = { p0[2],0,0,0 };
-    da = { p0[3],0,0,0 };
-}
-
-// Store 4 F16 pixels.
 SK_RASTER_STAGE(store_f16) {
     auto ptr = (uint64_t*)ctx + x;
 
-    Sk4h_store4(ptr, SkFloatToHalf_finite_ftz(r), SkFloatToHalf_finite_ftz(g),
-                     SkFloatToHalf_finite_ftz(b), SkFloatToHalf_finite_ftz(a));
-}
-
-// Store 1 F16 pixel.
-SK_RASTER_STAGE(store_f16_1) {
-    auto ptr = (uint64_t*)ctx + x;
+    switch (tail) {
+        case 0: return Sk4h_store4(ptr, SkFloatToHalf_finite_ftz(r), SkFloatToHalf_finite_ftz(g),
+                                        SkFloatToHalf_finite_ftz(b), SkFloatToHalf_finite_ftz(a));
 
-    SkFloatToHalf_finite_ftz({r[0], g[0], b[0], a[0]}).store(ptr);
+        case 3: SkFloatToHalf_finite_ftz({r[2], g[2], b[2], a[2]}).store(ptr+2);
+        case 2: SkFloatToHalf_finite_ftz({r[1], g[1], b[1], a[1]}).store(ptr+1);
+        case 1: SkFloatToHalf_finite_ftz({r[0], g[0], b[0], a[0]}).store(ptr+0);
+    }
 }
 
-// Load 4 8-bit sRGB pixels from SkPMColor order to RGBA.
+// Load 8-bit SkPMColor-order sRGB.
 SK_RASTER_STAGE(load_d_srgb) {
     auto ptr = (const uint32_t*)ctx + x;
 
+    if (tail) {
+        float rs[] = {0,0,0,0},
+              gs[] = {0,0,0,0},
+              bs[] = {0,0,0,0},
+              as[] = {0,0,0,0};
+        for (size_t i = 0; i < tail; i++) {
+            rs[i] = sk_linear_from_srgb[(ptr[i] >> SK_R32_SHIFT) & 0xff];
+            gs[i] = sk_linear_from_srgb[(ptr[i] >> SK_G32_SHIFT) & 0xff];
+            bs[i] = sk_linear_from_srgb[(ptr[i] >> SK_B32_SHIFT) & 0xff];
+            as[i] = (1/255.0f) * (ptr[i] >> SK_A32_SHIFT)        ;
+        }
+        dr = Sk4f::Load(rs);
+        dg = Sk4f::Load(gs);
+        db = Sk4f::Load(bs);
+        da = Sk4f::Load(as);
+        return;
+    }
+
     dr = { sk_linear_from_srgb[(ptr[0] >> SK_R32_SHIFT) & 0xff],
            sk_linear_from_srgb[(ptr[1] >> SK_R32_SHIFT) & 0xff],
           sk_linear_from_srgb[(ptr[2] >> SK_R32_SHIFT) & 0xff],
@@ -250,34 +242,13 @@ SK_RASTER_STAGE(load_d_srgb) {
     da = SkNx_cast<float>(Sk4u::Load(ptr) >> SK_A32_SHIFT) * (1/255.0f);
 }
 
-// Tail variant of load_d_srgb() handling 1 pixel at a time.
-SK_RASTER_STAGE(load_d_srgb_1) {
-    auto ptr = (const uint32_t*)ctx + x;
-
-    dr = { sk_linear_from_srgb[(*ptr >> SK_R32_SHIFT) & 0xff], 0,0,0 };
-    dg = { sk_linear_from_srgb[(*ptr >> SK_G32_SHIFT) & 0xff], 0,0,0 };
-    db = { sk_linear_from_srgb[(*ptr >> SK_B32_SHIFT) & 0xff], 0,0,0 };
-    da = { (1/255.0f) * (*ptr >> SK_A32_SHIFT)               , 0,0,0 };
-}
-
-// Write out 4 pixels as 8-bit SkPMColor-order sRGB.
+// Store 8-bit SkPMColor-order sRGB.
 SK_RASTER_STAGE(store_srgb) {
     auto ptr = (uint32_t*)ctx + x;
-    ( sk_linear_to_srgb_noclamp(r) << SK_R32_SHIFT
-    | sk_linear_to_srgb_noclamp(g) << SK_G32_SHIFT
-    | sk_linear_to_srgb_noclamp(b) << SK_B32_SHIFT
-    | Sk4f_round(255.0f * a)       << SK_A32_SHIFT).store(ptr);
-}
-
-// Tail variant of store_srgb() handling 1 pixel at a time.
-SK_RASTER_STAGE(store_srgb_1) {
-    auto ptr = (uint32_t*)ctx + x;
-    Sk4i rgb = sk_linear_to_srgb_noclamp(swizzle_rb_if_bgra({ r[0], g[0], b[0], 0.0f }));
-
-    uint32_t rgba;
-    SkNx_cast<uint8_t>(rgb).store(&rgba);
-    rgba |= (uint32_t)(255.0f * a[0] + 0.5f) << 24;
-    *ptr = rgba;
+    store_tail(tail, ( sk_linear_to_srgb_noclamp(r) << SK_R32_SHIFT
+                     | sk_linear_to_srgb_noclamp(g) << SK_G32_SHIFT
+                     | sk_linear_to_srgb_noclamp(b) << SK_B32_SHIFT
+                     | Sk4f_round(255.0f * a)       << SK_A32_SHIFT), (int*)ptr);
 }
 
 static bool supported(const SkImageInfo& info) {
@@ -341,14 +312,14 @@ void SkRasterPipelineBlitter::append_load_d(SkRasterPipeline* p, const void* dst
     switch (fDst.info().colorType()) {
         case kN32_SkColorType:
             if (fDst.info().gammaCloseToSRGB()) {
-                p->append<load_d_srgb, load_d_srgb_1>(dst);
+                p->append<load_d_srgb>(dst);
             }
             break;
         case kRGBA_F16_SkColorType:
-            p->append<load_d_f16, load_d_f16_1>(dst);
+            p->append<load_d_f16>(dst);
             break;
         case kRGB_565_SkColorType:
-            p->append<load_d_565, load_d_565_1>(dst);
+            p->append<load_d_565>(dst);
             break;
         default: break;
     }
@@ -361,14 +332,14 @@ void SkRasterPipelineBlitter::append_store(SkRasterPipeline* p, void* dst) const
     switch (fDst.info().colorType()) {
         case kN32_SkColorType:
             if (fDst.info().gammaCloseToSRGB()) {
-                p->last<store_srgb, store_srgb_1>(dst);
+                p->last<store_srgb>(dst);
             }
             break;
        case kRGBA_F16_SkColorType:
-            p->last<store_f16, store_f16_1>(dst);
+            p->last<store_f16>(dst);
            break;
        case kRGB_565_SkColorType:
-            p->last<store_565, store_565_1>(dst);
+            p->last<store_565>(dst);
            break;
        default: break;
     }
@@ -426,10 +397,10 @@ void SkRasterPipelineBlitter::blitMask(const SkMask& mask, const SkIRect& clip)
     p.extend(fXfermode);
     switch (mask.fFormat) {
        case SkMask::kA8_Format:
-            p.append<lerp_a8, lerp_a8_1>(mask.getAddr8(x,y)-x);
+            p.append<lerp_a8>(mask.getAddr8(x,y)-x);
            break;
        case SkMask::kLCD16_Format:
-            p.append<lerp_lcd16, lerp_lcd16_1>(mask.getAddrLCD16(x,y)-x);
+            p.append<lerp_lcd16>(mask.getAddrLCD16(x,y)-x);
            break;
        default: break;
     }
diff --git a/src/core/SkXfermode.cpp b/src/core/SkXfermode.cpp
index 3e7b8bc7c2..2717fab7e9 100644
--- a/src/core/SkXfermode.cpp
+++ b/src/core/SkXfermode.cpp
@@ -1437,14 +1437,14 @@ static Sk4f inv(const Sk4f& x) { return 1.0f - x; }
 
 // Most of these modes apply the same logic kernel to each channel.
 template <Sk4f kernel(const Sk4f& s, const Sk4f& sa, const Sk4f& d, const Sk4f& da)>
-static void SK_VECTORCALL rgba(SkRasterPipeline::Stage* st, size_t x,
+static void SK_VECTORCALL rgba(SkRasterPipeline::Stage* st, size_t x, size_t tail,
                                Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
                                Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
     r = kernel(r,a,dr,da);
     g = kernel(g,a,dg,da);
     b = kernel(b,a,db,da);
     a = kernel(a,a,da,da);
-    st->next(x, r,g,b,a, dr,dg,db,da);
+    st->next(x,tail, r,g,b,a, dr,dg,db,da);
 }
 
 #define KERNEL(name) static Sk4f name(const Sk4f& s, const Sk4f& sa, const Sk4f& d, const Sk4f& da)
@@ -1468,14 +1468,14 @@ KERNEL(xor_) { return s*inv(da) + d*inv(sa); }
 
 // Most of the rest apply the same logic to each color channel, and srcover's logic to alpha.
 // (darken and lighten can actually go either way, but they're a little faster this way.)
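Transfer modes only rearrange register contents, so their stages ignore tail beyond forwarding it to next(), and the KERNEL wrappers keep new modes one-liners. For instance, a hypothetical additive mode (not part of this change) would be just:

    KERNEL(plus) { return s + d; }   // s' = s + d per channel; used as the stage rgba<plus>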
 template <Sk4f kernel(const Sk4f& s, const Sk4f& sa, const Sk4f& d, const Sk4f& da)>
-static void SK_VECTORCALL rgb_srcover(SkRasterPipeline::Stage* st, size_t x,
+static void SK_VECTORCALL rgb_srcover(SkRasterPipeline::Stage* st, size_t x, size_t tail,
                                       Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
                                       Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
     r = kernel(r,a,dr,da);
     g = kernel(g,a,dg,da);
     b = kernel(b,a,db,da);
     a = a + da*inv(a);
-    st->next(x, r,g,b,a, dr,dg,db,da);
+    st->next(x,tail, r,g,b,a, dr,dg,db,da);
 }
 
 KERNEL(colorburn) {
diff --git a/src/effects/SkArithmeticMode.cpp b/src/effects/SkArithmeticMode.cpp
index 876f34b430..e142533b0d 100644
--- a/src/effects/SkArithmeticMode.cpp
+++ b/src/effects/SkArithmeticMode.cpp
@@ -45,7 +45,7 @@ public:
 #endif
 
 private:
-    static void SK_VECTORCALL Stage(SkRasterPipeline::Stage* st, size_t x,
+    static void SK_VECTORCALL Stage(SkRasterPipeline::Stage* st, size_t x, size_t tail,
                                     Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
                                     Sk4f dr, Sk4f dg, Sk4f db, Sk4f da);
 
@@ -74,7 +74,8 @@ sk_sp<SkFlattenable> SkArithmeticMode_scalar::CreateProc(SkReadBuffer& buffer) {
     return SkArithmeticMode::Make(k1, k2, k3, k4, enforcePMColor);
 }
 
-void SK_VECTORCALL SkArithmeticMode_scalar::Stage(SkRasterPipeline::Stage* st, size_t x,
+void SK_VECTORCALL SkArithmeticMode_scalar::Stage(SkRasterPipeline::Stage* st,
+                                                  size_t x, size_t tail,
                                                   Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
                                                   Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
     auto self = st->ctx<const SkArithmeticMode_scalar*>();
@@ -91,7 +92,7 @@ void SK_VECTORCALL SkArithmeticMode_scalar::Stage(SkRasterPipeline::Stage* st, s
 
     // A later stage (clamp_01_premul) will pin and fEnforcePMColor for us.
 
-    st->next(x, r,g,b,a, dr,dg,db,da);
+    st->next(x,tail, r,g,b,a, dr,dg,db,da);
 }
 
 void SkArithmeticMode_scalar::xfer32(SkPMColor dst[], const SkPMColor src[],
diff --git a/tests/SkRasterPipelineTest.cpp b/tests/SkRasterPipelineTest.cpp
index 29fe59d31b..867baf7918 100644
--- a/tests/SkRasterPipelineTest.cpp
+++ b/tests/SkRasterPipelineTest.cpp
@@ -8,22 +8,16 @@
 #include "Test.h"
 #include "SkRasterPipeline.h"
 
-// load needs two variants, one to load 4 values...
 SK_RASTER_STAGE(load) {
     auto ptr = (const float*)ctx + x;
-    r = Sk4f{ptr[0]};
-    g = Sk4f{ptr[1]};
-    b = Sk4f{ptr[2]};
-    a = Sk4f{ptr[3]};
+    switch(tail&3) {
+        case 0: a = Sk4f{ptr[3]};
+        case 3: b = Sk4f{ptr[2]};
+        case 2: g = Sk4f{ptr[1]};
+        case 1: r = Sk4f{ptr[0]};
+    }
 }
 
-// ...and one to load a single value.
-SK_RASTER_STAGE(load_tail) {
-    auto ptr = (const float*)ctx + x;
-    r = Sk4f{*ptr};
-}
-
-// square doesn't really care how many of its inputs are active, nor does it need a context.
 SK_RASTER_STAGE(square) {
     r *= r;
     g *= g;
@@ -31,26 +25,22 @@ SK_RASTER_STAGE(square) {
     a *= a;
 }
 
-// Like load, store has a _tail variant.
 SK_RASTER_STAGE(store) {
     auto ptr = (float*)ctx + x;
-    ptr[0] = r[0];
-    ptr[1] = g[0];
-    ptr[2] = b[0];
-    ptr[3] = a[0];
-}
-
-SK_RASTER_STAGE(store_tail) {
-    auto ptr = (float*)ctx + x;
-    *ptr = r[0];
+    switch (tail&3) {
+        case 0: ptr[3] = a[0];
+        case 3: ptr[2] = b[0];
+        case 2: ptr[1] = g[0];
+        case 1: ptr[0] = r[0];
+    }
 }
 
 DEF_TEST(SkRasterPipeline, r) {
     // We'll build up and run a simple pipeline that exercises the salient
     // mechanics of SkRasterPipeline:
-    //    - context pointers
-    //    - stages sensitive to the number of pixels
-    //    - stages insensitive to the number of pixels
+    //    - context pointers (load,store)
+    //    - stages sensitive to the number of pixels (load,store)
+    //    - stages insensitive to the number of pixels (square)
     //
     // This pipeline loads up some values, squares them, then writes them back to memory.
 
@@ -58,9 +48,9 @@
     float  dst_vals[] = { 0,0,0,0,0 };
 
     SkRasterPipeline p;
-    p.append<load, load_tail>(src_vals);
+    p.append<load>(src_vals);
     p.append<square>();
-    p.append<store, store_tail>(dst_vals);
+    p.append<store>(dst_vals);
     p.run(5);
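The switch in load() and store() is doing double duty: tail&3 is 0 for a full 4-wide group, so case 0 touches the top slot and then falls through all the rest, while a tail of 1..3 enters lower down and touches only in-range slots. The same trick in isolation, worked through for this test's run(5) (a standalone sketch, not test code):

    #include <cstdio>
    #include <cstddef>

    // tail==0: full 4-wide group; tail==1..3: jagged end of a row.
    static void square4(float* p, size_t tail) {
        float r = 0, g = 0, b = 0, a = 0;
        switch (tail & 3) {           // load: fallthrough is deliberate
            case 0: a = p[3];
            case 3: b = p[2];
            case 2: g = p[1];
            case 1: r = p[0];
        }
        r *= r; g *= g; b *= b; a *= a;
        switch (tail & 3) {           // store: same shape, only in-range slots
            case 0: p[3] = a;
            case 3: p[2] = b;
            case 2: p[1] = g;
            case 1: p[0] = r;
        }
    }

    int main() {
        float vals[] = {1, 2, 3, 4, 5};
        square4(vals + 0, 0);  // one full body step squares {1,2,3,4}
        square4(vals + 4, 1);  // one tail step (tail=1) squares {5}
        for (float v : vals) { printf("%g ", v); }  // prints: 1 4 9 16 25
        return 0;
    }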