-rw-r--r--  bench/SkRasterPipelineBench.cpp       185
-rw-r--r--  src/core/SkOpts.cpp                    33
-rw-r--r--  src/core/SkOpts.h                      20
-rw-r--r--  src/core/SkRasterPipeline.h            54
-rw-r--r--  src/core/SkRasterPipelineBlitter.cpp  227
-rw-r--r--  src/opts/SkOpts_sse41.cpp              31
-rw-r--r--  src/opts/SkRasterPipeline_opts.h      333
-rw-r--r--  tests/SkRasterPipelineTest.cpp         28
8 files changed, 458 insertions(+), 453 deletions(-)
diff --git a/bench/SkRasterPipelineBench.cpp b/bench/SkRasterPipelineBench.cpp
index aa3e355e45..17a93da549 100644
--- a/bench/SkRasterPipelineBench.cpp
+++ b/bench/SkRasterPipelineBench.cpp
@@ -6,8 +6,8 @@
*/
#include "Benchmark.h"
+#include "SkOpts.h"
#include "SkRasterPipeline.h"
-#include "SkSRGB.h"
static const int N = 1023;
@@ -22,186 +22,21 @@ static uint8_t mask[N];
// - src = srcover(dst, src)
// - store src back as srgb
-SK_RASTER_STAGE(load_s_srgb) {
- auto ptr = (const uint32_t*)ctx + x;
-
- if (tail) {
- float rs[] = {0,0,0,0},
- gs[] = {0,0,0,0},
- bs[] = {0,0,0,0},
- as[] = {0,0,0,0};
- for (size_t i = 0; i < (tail&3); i++) {
- rs[i] = sk_linear_from_srgb[(ptr[i] >> 0) & 0xff];
- gs[i] = sk_linear_from_srgb[(ptr[i] >> 8) & 0xff];
- bs[i] = sk_linear_from_srgb[(ptr[i] >> 16) & 0xff];
- as[i] = (ptr[i] >> 24) * (1/255.0f);
- }
- r = Sk4f::Load(rs);
- g = Sk4f::Load(gs);
- b = Sk4f::Load(bs);
- a = Sk4f::Load(as);
- return;
- }
-
- r = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 0) & 0xff],
- sk_linear_from_srgb[(ptr[1] >> 0) & 0xff],
- sk_linear_from_srgb[(ptr[2] >> 0) & 0xff],
- sk_linear_from_srgb[(ptr[3] >> 0) & 0xff] };
-
- g = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 8) & 0xff],
- sk_linear_from_srgb[(ptr[1] >> 8) & 0xff],
- sk_linear_from_srgb[(ptr[2] >> 8) & 0xff],
- sk_linear_from_srgb[(ptr[3] >> 8) & 0xff] };
-
- b = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 16) & 0xff],
- sk_linear_from_srgb[(ptr[1] >> 16) & 0xff],
- sk_linear_from_srgb[(ptr[2] >> 16) & 0xff],
- sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] };
- a = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);
-}
-
-SK_RASTER_STAGE(load_d_srgb) {
- auto ptr = (const uint32_t*)ctx + x;
-
- if (tail) {
- float rs[] = {0,0,0,0},
- gs[] = {0,0,0,0},
- bs[] = {0,0,0,0},
- as[] = {0,0,0,0};
- for (size_t i = 0; i < (tail&3); i++) {
- rs[i] = sk_linear_from_srgb[(ptr[i] >> 0) & 0xff];
- gs[i] = sk_linear_from_srgb[(ptr[i] >> 8) & 0xff];
- bs[i] = sk_linear_from_srgb[(ptr[i] >> 16) & 0xff];
- as[i] = (ptr[i] >> 24) * (1/255.0f);
- }
- dr = Sk4f::Load(rs);
- dg = Sk4f::Load(gs);
- db = Sk4f::Load(bs);
- da = Sk4f::Load(as);
- return;
- }
-
- dr = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 0) & 0xff],
- sk_linear_from_srgb[(ptr[1] >> 0) & 0xff],
- sk_linear_from_srgb[(ptr[2] >> 0) & 0xff],
- sk_linear_from_srgb[(ptr[3] >> 0) & 0xff] };
-
- dg = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 8) & 0xff],
- sk_linear_from_srgb[(ptr[1] >> 8) & 0xff],
- sk_linear_from_srgb[(ptr[2] >> 8) & 0xff],
- sk_linear_from_srgb[(ptr[3] >> 8) & 0xff] };
-
- db = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 16) & 0xff],
- sk_linear_from_srgb[(ptr[1] >> 16) & 0xff],
- sk_linear_from_srgb[(ptr[2] >> 16) & 0xff],
- sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] };
-
- da = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);
-}
-
-SK_RASTER_STAGE(scale_u8) {
- auto ptr = (const uint8_t*)ctx + x;
-
- Sk4b cov;
-
- if (tail) {
- uint8_t cs[] = {0,0,0,0};
- switch (tail&3) {
- case 3: cs[2] = ptr[2];
- case 2: cs[1] = ptr[1];
- case 1: cs[0] = ptr[0];
- }
- cov = Sk4b::Load(cs);
- } else {
- cov = Sk4b::Load(ptr);
- }
-
- auto c = SkNx_cast<float>(cov) * (1/255.0f);
- r *= c;
- g *= c;
- b *= c;
- a *= c;
-}
-
-SK_RASTER_STAGE(srcover) {
- auto A = 1.0f - a;
- r += dr * A;
- g += dg * A;
- b += db * A;
- a += da * A;
-}
-
-SK_RASTER_STAGE(store_srgb) {
- auto ptr = (uint32_t*)ctx + x;
-
- uint32_t* dst = nullptr;
- uint32_t stack[4];
-
- if (tail) {
- dst = ptr;
- ptr = stack;
- }
-
- ( sk_linear_to_srgb(r)
- | sk_linear_to_srgb(g) << 8
- | sk_linear_to_srgb(b) << 16
- | Sk4f_round(255.0f*a) << 24).store(ptr);
-
- switch (tail&3) {
- case 3: dst[2] = ptr[2];
- case 2: dst[1] = ptr[1];
- case 1: dst[0] = ptr[0];
- }
-}
-
class SkRasterPipelineBench : public Benchmark {
public:
- SkRasterPipelineBench(bool fused) : fFused(fused) {}
-
bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
- const char* onGetName() override { return fFused ? "SkRasterPipelineBench_fused"
- : "SkRasterPipelineBench_pipeline"; }
+ const char* onGetName() override { return "SkRasterPipeline"; }
void onDraw(int loops, SkCanvas*) override {
while (loops --> 0) {
- fFused ? this->runFused() : this->runPipeline();
+ SkRasterPipeline p;
+ p.append(SkOpts::load_s_srgb_body, SkOpts::load_s_srgb_tail, src);
+ p.append(SkOpts::scale_u8_body, SkOpts::scale_u8_tail, mask);
+ p.append(SkOpts::load_d_srgb_body, SkOpts::load_d_srgb_tail, dst);
+ p.append(SkOpts::srcover);
+ p.append(SkOpts::store_srgb_body, SkOpts::store_srgb_tail, dst);
+ p.run(N);
}
}
-
- void runFused() {
- Sk4f r,g,b,a, dr,dg,db,da;
- size_t x = 0, n = N;
- while (n >= 4) {
- load_s_srgb(src , x,0, r,g,b,a, dr,dg,db,da);
- scale_u8 (mask , x,0, r,g,b,a, dr,dg,da,da);
- load_d_srgb(dst , x,0, r,g,b,a, dr,dg,da,da);
- srcover (nullptr, x,0, r,g,b,a, dr,dg,da,da);
- store_srgb (dst , x,0, r,g,b,a, dr,dg,da,da);
-
- x += 4;
- n -= 4;
- }
- if (n > 0) {
- load_s_srgb(src , x,n, r,g,b,a, dr,dg,db,da);
- scale_u8 (mask , x,n, r,g,b,a, dr,dg,da,da);
- load_d_srgb(dst , x,n, r,g,b,a, dr,dg,da,da);
- srcover (nullptr, x,n, r,g,b,a, dr,dg,da,da);
- store_srgb (dst , x,n, r,g,b,a, dr,dg,da,da);
- }
- }
-
- void runPipeline() {
- SkRasterPipeline p;
- p.append<load_s_srgb>(src);
- p.append< scale_u8>(mask);
- p.append<load_d_srgb>(dst);
- p.append< srcover>();
- p.last < store_srgb>(dst);
- p.run(N);
- }
-
- bool fFused;
};
-
-DEF_BENCH( return new SkRasterPipelineBench(true); )
-DEF_BENCH( return new SkRasterPipelineBench(false); )
+DEF_BENCH( return new SkRasterPipelineBench; )
diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index 9ecad15416..1f686ff56b 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -43,6 +43,7 @@
#include "SkChecksum_opts.h"
#include "SkColorCubeFilter_opts.h"
#include "SkMorphologyImageFilter_opts.h"
+#include "SkRasterPipeline_opts.h"
#include "SkSwizzler_opts.h"
#include "SkTextureCompressor_opts.h"
#include "SkXfermode_opts.h"
@@ -89,6 +90,38 @@ namespace SkOpts {
DEFINE_DEFAULT(hash_fn);
#undef DEFINE_DEFAULT
+// Stages that are not sensitive to the tail parameter can be represented by one function.
+#define DEFINE_DEFAULT(stage, kCallNext) \
+ decltype(stage) stage = body<SK_OPTS_NS::stage, kCallNext>
+
+ DEFINE_DEFAULT(srcover, true);
+ DEFINE_DEFAULT(constant_color, true);
+ DEFINE_DEFAULT(lerp_constant_float, true);
+#undef DEFINE_DEFAULT
+
+// Stages that are sensitive to the tail parameter need two versions, _body and _tail.
+#define DEFINE_DEFAULT(stage, kCallNext) \
+ decltype(stage##_body) stage##_body = body<SK_OPTS_NS::stage, kCallNext>; \
+ decltype(stage##_tail) stage##_tail = tail<SK_OPTS_NS::stage, kCallNext>
+
+ DEFINE_DEFAULT(load_d_srgb, true);
+ DEFINE_DEFAULT(load_s_srgb, true);
+ DEFINE_DEFAULT( store_srgb, false);
+
+ DEFINE_DEFAULT(load_d_f16, true);
+ DEFINE_DEFAULT(load_s_f16, true);
+ DEFINE_DEFAULT( store_f16, false);
+
+ DEFINE_DEFAULT(load_d_565, true);
+ DEFINE_DEFAULT(load_s_565, true);
+ DEFINE_DEFAULT( store_565, false);
+
+ DEFINE_DEFAULT(scale_u8, true);
+
+ DEFINE_DEFAULT(lerp_u8, true);
+ DEFINE_DEFAULT(lerp_565, true);
+#undef DEFINE_DEFAULT
+
// Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
void Init_ssse3();
void Init_sse41();
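
For reference, the second DEFINE_DEFAULT above is shorthand for a pair of pointer definitions; a rough sketch of its expansion for one stage, assuming the body<> and tail<> templates defined in SkRasterPipeline_opts.h:

    // Expansion of DEFINE_DEFAULT(scale_u8, true), written out:
    decltype(scale_u8_body) scale_u8_body = body<SK_OPTS_NS::scale_u8, /*kCallNext=*/true>;
    decltype(scale_u8_tail) scale_u8_tail = tail<SK_OPTS_NS::scale_u8, /*kCallNext=*/true>;
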
diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h
index 44e337d950..c310a79aa0 100644
--- a/src/core/SkOpts.h
+++ b/src/core/SkOpts.h
@@ -8,6 +8,7 @@
#ifndef SkOpts_DEFINED
#define SkOpts_DEFINED
+#include "SkRasterPipeline.h"
#include "SkTextureCompressor.h"
#include "SkTypes.h"
#include "SkXfermode.h"
@@ -71,6 +72,25 @@ namespace SkOpts {
static inline uint32_t hash(const void* data, size_t bytes, uint32_t seed=0) {
return hash_fn(data, bytes, seed);
}
+
+ // Each of the SkRasterPipeline::Fn's lists its context pointer type in the comments.
+
+ extern SkRasterPipeline::Fn srcover, // (none)
+ constant_color, // const SkPM4f*
+ lerp_constant_float; // const float*, in [0,1]
+
+ extern SkRasterPipeline::Fn load_d_srgb_body, load_d_srgb_tail, // const uint32_t*
+ load_s_srgb_body, load_s_srgb_tail, // const uint32_t*
+ store_srgb_body, store_srgb_tail, // uint32_t*
+ load_d_f16_body, load_d_f16_tail, // const uint64_t*
+ load_s_f16_body, load_s_f16_tail, // const uint64_t*
+ store_f16_body, store_f16_tail, // uint64_t*
+ load_d_565_body, load_d_565_tail, // const uint16_t*
+ load_s_565_body, load_s_565_tail, // const uint16_t*
+ store_565_body, store_565_tail, // uint16_t*
+ scale_u8_body, scale_u8_tail, // const uint8_t*
+ lerp_u8_body, lerp_u8_tail, // const uint8_t*
+ lerp_565_body, lerp_565_tail; // const uint16_t*
}
#endif//SkOpts_DEFINED
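
A usage sketch, mirroring the benchmark above: each _body pointer is paired with its _tail twin and the matching context pointer when appended. Here src, dst, and n are assumed inputs, not names from this header:

    // Sketch: srcover n sRGB pixels from src onto dst.
    SkRasterPipeline p;
    p.append(SkOpts::load_s_srgb_body, SkOpts::load_s_srgb_tail, src);  // const uint32_t*
    p.append(SkOpts::load_d_srgb_body, SkOpts::load_d_srgb_tail, dst);  // const uint32_t*
    p.append(SkOpts::srcover);                                          // no context needed
    p.append(SkOpts::store_srgb_body, SkOpts::store_srgb_tail, dst);    // uint32_t*
    p.run(n);
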
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index 7e934f1731..9f4dcb34da 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -48,14 +48,6 @@
*
* Some stages that typically return are those that write a color to a destination pointer,
* but any stage can short-circuit the rest of the pipeline by returning instead of calling next().
- *
- * Most simple pipeline stages can use the SK_RASTER_STAGE macro to define a static EasyFn,
- * which simplifies the user interface a bit:
- * - the context pointer is available directly as the first parameter;
- * - instead of manually calling a next() function, just modify registers in place.
- *
- * To add an EasyFn stage to the pipeline, call append<fn>() instead of append(&fn).
- * It's a slight performance benefit to call last<fn>() for the last stage of a pipeline.
*/
// TODO: There may be a better place to stuff tail, e.g. in the bottom alignment bits of
@@ -66,9 +58,6 @@ public:
struct Stage;
using Fn = void(SK_VECTORCALL *)(Stage*, size_t, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
Sk4f,Sk4f,Sk4f,Sk4f);
- using EasyFn = void(void*, size_t, size_t, Sk4f&, Sk4f&, Sk4f&, Sk4f&,
- Sk4f&, Sk4f&, Sk4f&, Sk4f&);
-
struct Stage {
template <typename T>
T ctx() { return static_cast<T>(fCtx); }
@@ -99,17 +88,6 @@ public:
void append(Fn body, Fn tail, const void* ctx = nullptr);
void append(Fn fn, const void* ctx = nullptr) { this->append(fn, fn, ctx); }
- // Version of append that can be used with static EasyFn (see SK_RASTER_STAGE).
- template <EasyFn fn>
- void append(const void* ctx = nullptr) {
- this->append(Body<fn,true>, Tail<fn,true>, ctx);
- }
-
- // If this is the last stage of the pipeline, last() is a bit faster than append().
- template <EasyFn fn>
- void last(const void* ctx = nullptr) {
- this->append(Body<fn,false>, Tail<fn,false>, ctx);
- }
// Append all stages to this pipeline.
void extend(const SkRasterPipeline&);
@@ -122,42 +100,10 @@ private:
// buggy pipeline can't walk off its own end.
static void SK_VECTORCALL JustReturn(Stage*, size_t, size_t, Sk4f,Sk4f,Sk4f,Sk4f,
Sk4f,Sk4f,Sk4f,Sk4f);
-
- template <EasyFn kernel, bool kCallNext>
- static void SK_VECTORCALL Body(SkRasterPipeline::Stage* st, size_t x, size_t tail,
- Sk4f r, Sk4f g, Sk4f b, Sk4f a,
- Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
- // Passing 0 lets the optimizer completely drop any "if (tail) {...}" code in kernel.
- kernel(st->ctx<void*>(), x,0, r,g,b,a, dr,dg,db,da);
- if (kCallNext) {
- st->next(x,tail, r,g,b,a, dr,dg,db,da); // It's faster to pass tail here than 0.
- }
- }
-
- template <EasyFn kernel, bool kCallNext>
- static void SK_VECTORCALL Tail(SkRasterPipeline::Stage* st, size_t x, size_t tail,
- Sk4f r, Sk4f g, Sk4f b, Sk4f a,
- Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
- #if defined(__clang__)
- __builtin_assume(tail > 0); // This flourish lets Clang compile away any tail==0 code.
- #endif
- kernel(st->ctx<void*>(), x,tail, r,g,b,a, dr,dg,db,da);
- if (kCallNext) {
- st->next(x,tail, r,g,b,a, dr,dg,db,da);
- }
- }
-
Stages fBody,
fTail;
Fn fBodyStart = &JustReturn,
fTailStart = &JustReturn;
};
-// These are always static, and we _really_ want them to inline.
-// If you find yourself wanting a non-inline stage, write a SkRasterPipeline::Fn directly.
-#define SK_RASTER_STAGE(name) \
- static SK_ALWAYS_INLINE void name(void* ctx, size_t x, size_t tail, \
- Sk4f& r, Sk4f& g, Sk4f& b, Sk4f& a, \
- Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da)
-
#endif//SkRasterPipeline_DEFINED
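
With SK_RASTER_STAGE and EasyFn removed from this header, a stage is now written directly against the Fn signature. A minimal sketch with a hypothetical premul stage (the stages in tests/SkRasterPipelineTest.cpp below follow the same shape):

    // Hypothetical stage: premultiply color by alpha, then chain onward.
    static void SK_VECTORCALL premul(SkRasterPipeline::Stage* st, size_t x, size_t tail,
                                     Sk4f r, Sk4f g, Sk4f b, Sk4f a,
                                     Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
        r *= a;
        g *= a;
        b *= a;
        st->next(x,tail, r,g,b,a, dr,dg,db,da);  // call next(), or return to end the pipeline
    }
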
diff --git a/src/core/SkRasterPipelineBlitter.cpp b/src/core/SkRasterPipelineBlitter.cpp
index 46820d3379..2ada336cd8 100644
--- a/src/core/SkRasterPipelineBlitter.cpp
+++ b/src/core/SkRasterPipelineBlitter.cpp
@@ -8,11 +8,10 @@
#include "SkBlitter.h"
#include "SkColor.h"
#include "SkColorFilter.h"
-#include "SkHalf.h"
+#include "SkOpts.h"
#include "SkPM4f.h"
#include "SkRasterPipeline.h"
#include "SkShader.h"
-#include "SkSRGB.h"
#include "SkXfermode.h"
@@ -57,200 +56,6 @@ SkBlitter* SkCreateRasterPipelineBlitter(const SkPixmap& dst,
return SkRasterPipelineBlitter::Create(dst, paint, alloc);
}
-// Clamp colors into [0,1] premul (e.g. just before storing back to memory).
-SK_RASTER_STAGE(clamp_01_premul) {
- a = Sk4f::Max(a, 0.0f);
- r = Sk4f::Max(r, 0.0f);
- g = Sk4f::Max(g, 0.0f);
- b = Sk4f::Max(b, 0.0f);
-
- a = Sk4f::Min(a, 1.0f);
- r = Sk4f::Min(r, a);
- g = Sk4f::Min(g, a);
- b = Sk4f::Min(b, a);
-}
-
-// The default shader produces a constant color (from the SkPaint).
-SK_RASTER_STAGE(constant_color) {
- auto color = (const SkPM4f*)ctx;
- r = color->r();
- g = color->g();
- b = color->b();
- a = color->a();
-}
-
-// The default transfer mode is srcover, s' = s + d*(1-sa).
-SK_RASTER_STAGE(srcover) {
- r += dr*(1.0f - a);
- g += dg*(1.0f - a);
- b += db*(1.0f - a);
- a += da*(1.0f - a);
-}
-
-static Sk4f lerp(const Sk4f& from, const Sk4f& to, const Sk4f& cov) {
- return from + (to-from)*cov;
-}
-
-// s' = d(1-c) + sc, for a constant c.
-SK_RASTER_STAGE(lerp_constant_float) {
- Sk4f c = *(const float*)ctx;
-
- r = lerp(dr, r, c);
- g = lerp(dg, g, c);
- b = lerp(db, b, c);
- a = lerp(da, a, c);
-}
-
-template <typename T>
-static SkNx<4,T> load_tail(size_t tail, const T* src) {
- if (tail) {
- return SkNx<4,T>(src[0], (tail>1 ? src[1] : 0), (tail>2 ? src[2] : 0), 0);
- }
- return SkNx<4,T>::Load(src);
-}
-
-template <typename T>
-static void store_tail(size_t tail, const SkNx<4,T>& v, T* dst) {
- switch(tail) {
- case 0: return v.store(dst);
- case 3: dst[2] = v[2];
- case 2: dst[1] = v[1];
- case 1: dst[0] = v[0];
- }
-}
-
-// s' = d(1-c) + sc for 8-bit c.
-SK_RASTER_STAGE(lerp_a8) {
- auto ptr = (const uint8_t*)ctx + x;
-
- Sk4f c = SkNx_cast<float>(load_tail(tail, ptr)) * (1/255.0f);
- r = lerp(dr, r, c);
- g = lerp(dg, g, c);
- b = lerp(db, b, c);
- a = lerp(da, a, c);
-}
-
-static void from_565(const Sk4h& _565, Sk4f* r, Sk4f* g, Sk4f* b) {
- Sk4i _32_bit = SkNx_cast<int>(_565);
-
- *r = SkNx_cast<float>(_32_bit & SK_R16_MASK_IN_PLACE) * (1.0f / SK_R16_MASK_IN_PLACE);
- *g = SkNx_cast<float>(_32_bit & SK_G16_MASK_IN_PLACE) * (1.0f / SK_G16_MASK_IN_PLACE);
- *b = SkNx_cast<float>(_32_bit & SK_B16_MASK_IN_PLACE) * (1.0f / SK_B16_MASK_IN_PLACE);
-}
-
-static Sk4h to_565(const Sk4f& r, const Sk4f& g, const Sk4f& b) {
- return SkNx_cast<uint16_t>( Sk4f_round(r * SK_R16_MASK) << SK_R16_SHIFT
- | Sk4f_round(g * SK_G16_MASK) << SK_G16_SHIFT
- | Sk4f_round(b * SK_B16_MASK) << SK_B16_SHIFT);
-}
-
-// s' = d(1-c) + sc for 565 c.
-SK_RASTER_STAGE(lerp_lcd16) {
- auto ptr = (const uint16_t*)ctx + x;
- Sk4f cr, cg, cb;
- from_565(load_tail(tail, ptr), &cr, &cg, &cb);
-
- r = lerp(dr, r, cr);
- g = lerp(dg, g, cg);
- b = lerp(db, b, cb);
- a = 1.0f;
-}
-
-SK_RASTER_STAGE(load_d_565) {
- auto ptr = (const uint16_t*)ctx + x;
- from_565(load_tail(tail, ptr), &dr,&dg,&db);
- da = 1.0f;
-}
-
-SK_RASTER_STAGE(store_565) {
- auto ptr = (uint16_t*)ctx + x;
- store_tail(tail, to_565(r,g,b), ptr);
-}
-
-SK_RASTER_STAGE(load_d_f16) {
- auto ptr = (const uint64_t*)ctx + x;
-
- if (tail) {
- auto p0 = SkHalfToFloat_finite_ftz(ptr[0]) ,
- p1 = tail>1 ? SkHalfToFloat_finite_ftz(ptr[1]) : Sk4f{0},
- p2 = tail>2 ? SkHalfToFloat_finite_ftz(ptr[2]) : Sk4f{0};
- dr = { p0[0],p1[0],p2[0],0 };
- dg = { p0[1],p1[1],p2[1],0 };
- db = { p0[2],p1[2],p2[2],0 };
- da = { p0[3],p1[3],p2[3],0 };
- return;
- }
-
- Sk4h rh, gh, bh, ah;
- Sk4h_load4(ptr, &rh, &gh, &bh, &ah);
- dr = SkHalfToFloat_finite_ftz(rh);
- dg = SkHalfToFloat_finite_ftz(gh);
- db = SkHalfToFloat_finite_ftz(bh);
- da = SkHalfToFloat_finite_ftz(ah);
-}
-
-SK_RASTER_STAGE(store_f16) {
- auto ptr = (uint64_t*)ctx + x;
-
- switch (tail) {
- case 0: return Sk4h_store4(ptr, SkFloatToHalf_finite_ftz(r), SkFloatToHalf_finite_ftz(g),
- SkFloatToHalf_finite_ftz(b), SkFloatToHalf_finite_ftz(a));
-
- case 3: SkFloatToHalf_finite_ftz({r[2], g[2], b[2], a[2]}).store(ptr+2);
- case 2: SkFloatToHalf_finite_ftz({r[1], g[1], b[1], a[1]}).store(ptr+1);
- case 1: SkFloatToHalf_finite_ftz({r[0], g[0], b[0], a[0]}).store(ptr+0);
- }
-}
-
-// Load 8-bit SkPMColor-order sRGB.
-SK_RASTER_STAGE(load_d_srgb) {
- auto ptr = (const uint32_t*)ctx + x;
-
- if (tail) {
- float rs[] = {0,0,0,0},
- gs[] = {0,0,0,0},
- bs[] = {0,0,0,0},
- as[] = {0,0,0,0};
- for (size_t i = 0; i < tail; i++) {
- rs[i] = sk_linear_from_srgb[(ptr[i] >> SK_R32_SHIFT) & 0xff];
- gs[i] = sk_linear_from_srgb[(ptr[i] >> SK_G32_SHIFT) & 0xff];
- bs[i] = sk_linear_from_srgb[(ptr[i] >> SK_B32_SHIFT) & 0xff];
- as[i] = (1/255.0f) * (ptr[i] >> SK_A32_SHIFT) ;
- }
- dr = Sk4f::Load(rs);
- dg = Sk4f::Load(gs);
- db = Sk4f::Load(bs);
- da = Sk4f::Load(as);
- return;
- }
-
- dr = { sk_linear_from_srgb[(ptr[0] >> SK_R32_SHIFT) & 0xff],
- sk_linear_from_srgb[(ptr[1] >> SK_R32_SHIFT) & 0xff],
- sk_linear_from_srgb[(ptr[2] >> SK_R32_SHIFT) & 0xff],
- sk_linear_from_srgb[(ptr[3] >> SK_R32_SHIFT) & 0xff] };
-
- dg = { sk_linear_from_srgb[(ptr[0] >> SK_G32_SHIFT) & 0xff],
- sk_linear_from_srgb[(ptr[1] >> SK_G32_SHIFT) & 0xff],
- sk_linear_from_srgb[(ptr[2] >> SK_G32_SHIFT) & 0xff],
- sk_linear_from_srgb[(ptr[3] >> SK_G32_SHIFT) & 0xff] };
-
- db = { sk_linear_from_srgb[(ptr[0] >> SK_B32_SHIFT) & 0xff],
- sk_linear_from_srgb[(ptr[1] >> SK_B32_SHIFT) & 0xff],
- sk_linear_from_srgb[(ptr[2] >> SK_B32_SHIFT) & 0xff],
- sk_linear_from_srgb[(ptr[3] >> SK_B32_SHIFT) & 0xff] };
-
- da = SkNx_cast<float>(Sk4u::Load(ptr) >> SK_A32_SHIFT) * (1/255.0f);
-}
-
-// Store 8-bit SkPMColor-order sRGB.
-SK_RASTER_STAGE(store_srgb) {
- auto ptr = (uint32_t*)ctx + x;
- store_tail(tail, ( sk_linear_to_srgb_noclamp(r) << SK_R32_SHIFT
- | sk_linear_to_srgb_noclamp(g) << SK_G32_SHIFT
- | sk_linear_to_srgb_noclamp(b) << SK_B32_SHIFT
- | Sk4f_round(255.0f * a) << SK_A32_SHIFT), (int*)ptr);
-}
-
static bool supported(const SkImageInfo& info) {
switch (info.colorType()) {
case kN32_SkColorType: return info.gammaCloseToSRGB();
@@ -297,10 +102,10 @@ SkBlitter* SkRasterPipelineBlitter::Create(const SkPixmap& dst,
color.premul());
if (!paint.getShader()) {
- blitter->fShader.append<constant_color>(&blitter->fPaintColor);
+ blitter->fShader.append(SkOpts::constant_color, &blitter->fPaintColor);
}
if (!paint.getXfermode()) {
- blitter->fXfermode.append<srcover>();
+ blitter->fXfermode.append(SkOpts::srcover);
}
return blitter;
@@ -312,41 +117,33 @@ void SkRasterPipelineBlitter::append_load_d(SkRasterPipeline* p, const void* dst
switch (fDst.info().colorType()) {
case kN32_SkColorType:
if (fDst.info().gammaCloseToSRGB()) {
- p->append<load_d_srgb>(dst);
+ p->append(SkOpts::load_d_srgb_body, SkOpts::load_d_srgb_tail, dst);
}
break;
case kRGBA_F16_SkColorType:
- p->append<load_d_f16>(dst);
+ p->append(SkOpts::load_d_f16_body, SkOpts::load_d_f16_tail, dst);
break;
case kRGB_565_SkColorType:
- p->append<load_d_565>(dst);
+ p->append(SkOpts::load_d_565_body, SkOpts::load_d_565_tail, dst);
break;
default: break;
}
}
-template <SkRasterPipeline::EasyFn fn>
-static void clamp_01_premul_then(void* ctx, size_t x, size_t tail,
- Sk4f& r, Sk4f& g, Sk4f& b, Sk4f& a,
- Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da) {
- clamp_01_premul(nullptr, x,tail, r,g,b,a, dr,dg,db,da);
- fn( ctx, x,tail, r,g,b,a, dr,dg,db,da);
-}
-
void SkRasterPipelineBlitter::append_store(SkRasterPipeline* p, void* dst) const {
SkASSERT(supported(fDst.info()));
switch (fDst.info().colorType()) {
case kN32_SkColorType:
if (fDst.info().gammaCloseToSRGB()) {
- p->last<clamp_01_premul_then<store_srgb>>(dst);
+ p->append(SkOpts::store_srgb_body, SkOpts::store_srgb_tail, dst);
}
break;
case kRGBA_F16_SkColorType:
- p->last<clamp_01_premul_then<store_f16>>(dst);
+ p->append(SkOpts::store_f16_body, SkOpts::store_f16_tail, dst);
break;
case kRGB_565_SkColorType:
- p->last<clamp_01_premul_then<store_565>>(dst);
+ p->append(SkOpts::store_565_body, SkOpts::store_565_tail, dst);
break;
default: break;
}
@@ -374,7 +171,7 @@ void SkRasterPipelineBlitter::blitAntiH(int x, int y, const SkAlpha aa[], const
p.extend(fColorFilter);
this->append_load_d(&p, dst);
p.extend(fXfermode);
- p.append<lerp_constant_float>(&coverage);
+ p.append(SkOpts::lerp_constant_float, &coverage);
this->append_store(&p, dst);
for (int16_t run = *runs; run > 0; run = *runs) {
@@ -404,10 +201,10 @@ void SkRasterPipelineBlitter::blitMask(const SkMask& mask, const SkIRect& clip)
p.extend(fXfermode);
switch (mask.fFormat) {
case SkMask::kA8_Format:
- p.append<lerp_a8>(mask.getAddr8(x,y)-x);
+ p.append(SkOpts::lerp_u8_body, SkOpts::lerp_u8_tail, mask.getAddr8(x,y)-x);
break;
case SkMask::kLCD16_Format:
- p.append<lerp_lcd16>(mask.getAddrLCD16(x,y)-x);
+ p.append(SkOpts::lerp_565_body, SkOpts::lerp_565_tail, mask.getAddrLCD16(x,y)-x);
break;
default: break;
}
diff --git a/src/opts/SkOpts_sse41.cpp b/src/opts/SkOpts_sse41.cpp
index 17ce0668ff..e4e3246780 100644
--- a/src/opts/SkOpts_sse41.cpp
+++ b/src/opts/SkOpts_sse41.cpp
@@ -11,6 +11,7 @@
#include "SkBlurImageFilter_opts.h"
#include "SkBlitRow_opts.h"
#include "SkBlend_opts.h"
+#include "SkRasterPipeline_opts.h"
namespace SkOpts {
void Init_sse41() {
@@ -19,5 +20,35 @@ namespace SkOpts {
box_blur_yx = sse41::box_blur_yx;
srcover_srgb_srgb = sse41::srcover_srgb_srgb;
blit_row_s32a_opaque = sse41::blit_row_s32a_opaque;
+
+ #define STAGE(stage, kCallNext) \
+ stage = body<SK_OPTS_NS::stage, kCallNext>
+
+ STAGE(srcover, true);
+ STAGE(constant_color, true);
+ STAGE(lerp_constant_float, true);
+ #undef STAGE
+
+ #define STAGE(stage, kCallNext) \
+ stage##_body = body<SK_OPTS_NS::stage, kCallNext>; \
+ stage##_tail = tail<SK_OPTS_NS::stage, kCallNext>
+
+ STAGE(load_d_srgb, true);
+ STAGE(load_s_srgb, true);
+ STAGE( store_srgb, false);
+
+ STAGE(load_d_f16, true);
+ STAGE(load_s_f16, true);
+ STAGE( store_f16, false);
+
+ STAGE(load_d_565, true);
+ STAGE(load_s_565, true);
+ STAGE( store_565, false);
+
+ STAGE(scale_u8, true);
+
+ STAGE(lerp_u8, true);
+ STAGE(lerp_565, true);
+ #undef STAGE
}
}
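
STAGE here mirrors DEFINE_DEFAULT in SkOpts.cpp; the difference is that this translation unit is compiled with SSE4.1 enabled, so (by assumption about the build setup) SK_OPTS_NS names the sse41 namespace, and Init_sse41() repoints each default at the faster specialization. Roughly, STAGE(lerp_u8, true) expands to:

    // Sketch of the expansion inside Init_sse41():
    lerp_u8_body = body<sse41::lerp_u8, /*kCallNext=*/true>;  // SSE4.1-compiled body
    lerp_u8_tail = tail<sse41::lerp_u8, /*kCallNext=*/true>;  // SSE4.1-compiled tail
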
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
new file mode 100644
index 0000000000..70c4d0c225
--- /dev/null
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -0,0 +1,333 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkRasterPipeline_opts_DEFINED
+#define SkRasterPipeline_opts_DEFINED
+
+#include "SkHalf.h"
+#include "SkPM4f.h"
+#include "SkRasterPipeline.h"
+#include "SkSRGB.h"
+
+using Kernel_Sk4f = void(void*, size_t, size_t, Sk4f&, Sk4f&, Sk4f&, Sk4f&,
+ Sk4f&, Sk4f&, Sk4f&, Sk4f&);
+
+// These are always static, and we _really_ want them to inline.
+// If you find yourself wanting a non-inline stage, write a SkRasterPipeline::Fn directly.
+#define KERNEL_Sk4f(name) \
+ static SK_ALWAYS_INLINE void name(void* ctx, size_t x, size_t tail, \
+ Sk4f& r, Sk4f& g, Sk4f& b, Sk4f& a, \
+ Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da)
+
+
+template <Kernel_Sk4f kernel, bool kCallNext>
+static inline void SK_VECTORCALL body(SkRasterPipeline::Stage* st, size_t x, size_t t,
+ Sk4f r, Sk4f g, Sk4f b, Sk4f a,
+ Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+ // Passing 0 lets the optimizer completely drop any "if (tail) {...}" code in kernel.
+ kernel(st->ctx<void*>(), x,0, r,g,b,a, dr,dg,db,da);
+ if (kCallNext) {
+ st->next(x,t, r,g,b,a, dr,dg,db,da); // It's faster to pass t here than 0.
+ }
+}
+
+template <Kernel_Sk4f kernel, bool kCallNext>
+static inline void SK_VECTORCALL tail(SkRasterPipeline::Stage* st, size_t x, size_t t,
+ Sk4f r, Sk4f g, Sk4f b, Sk4f a,
+ Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+#if defined(__clang__)
+ __builtin_assume(t > 0); // This flourish lets Clang compile away any t==0 code.
+#endif
+ kernel(st->ctx<void*>(), x,t, r,g,b,a, dr,dg,db,da);
+ if (kCallNext) {
+ st->next(x,t, r,g,b,a, dr,dg,db,da);
+ }
+}
+
+namespace SK_OPTS_NS {
+
+ // Clamp colors into [0,1] premul (e.g. just before storing back to memory).
+ static void clamp_01_premul(Sk4f& r, Sk4f& g, Sk4f& b, Sk4f& a) {
+ a = Sk4f::Max(a, 0.0f);
+ r = Sk4f::Max(r, 0.0f);
+ g = Sk4f::Max(g, 0.0f);
+ b = Sk4f::Max(b, 0.0f);
+
+ a = Sk4f::Min(a, 1.0f);
+ r = Sk4f::Min(r, a);
+ g = Sk4f::Min(g, a);
+ b = Sk4f::Min(b, a);
+ }
+
+ static Sk4f lerp(const Sk4f& from, const Sk4f& to, const Sk4f& cov) {
+ return from + (to-from)*cov;
+ }
+
+ template <typename T>
+ static SkNx<4,T> load_tail(size_t tail, const T* src) {
+ if (tail) {
+ return SkNx<4,T>(src[0], (tail>1 ? src[1] : 0), (tail>2 ? src[2] : 0), 0);
+ }
+ return SkNx<4,T>::Load(src);
+ }
+
+ template <typename T>
+ static void store_tail(size_t tail, const SkNx<4,T>& v, T* dst) {
+ switch(tail) {
+ case 0: return v.store(dst);
+ case 3: dst[2] = v[2];
+ case 2: dst[1] = v[1];
+ case 1: dst[0] = v[0];
+ }
+ }
+
+ static void from_565(const Sk4h& _565, Sk4f* r, Sk4f* g, Sk4f* b) {
+ Sk4i _32_bit = SkNx_cast<int>(_565);
+
+ *r = SkNx_cast<float>(_32_bit & SK_R16_MASK_IN_PLACE) * (1.0f / SK_R16_MASK_IN_PLACE);
+ *g = SkNx_cast<float>(_32_bit & SK_G16_MASK_IN_PLACE) * (1.0f / SK_G16_MASK_IN_PLACE);
+ *b = SkNx_cast<float>(_32_bit & SK_B16_MASK_IN_PLACE) * (1.0f / SK_B16_MASK_IN_PLACE);
+ }
+
+ static Sk4h to_565(const Sk4f& r, const Sk4f& g, const Sk4f& b) {
+ return SkNx_cast<uint16_t>( Sk4f_round(r * SK_R16_MASK) << SK_R16_SHIFT
+ | Sk4f_round(g * SK_G16_MASK) << SK_G16_SHIFT
+ | Sk4f_round(b * SK_B16_MASK) << SK_B16_SHIFT);
+ }
+
+
+ // The default shader produces a constant color (from the SkPaint).
+ KERNEL_Sk4f(constant_color) {
+ auto color = (const SkPM4f*)ctx;
+ r = color->r();
+ g = color->g();
+ b = color->b();
+ a = color->a();
+ }
+
+ // The default transfer mode is srcover, s' = s + d*(1-sa).
+ KERNEL_Sk4f(srcover) {
+ r += dr*(1.0f - a);
+ g += dg*(1.0f - a);
+ b += db*(1.0f - a);
+ a += da*(1.0f - a);
+ }
+
+ // s' = d(1-c) + sc, for a constant c.
+ KERNEL_Sk4f(lerp_constant_float) {
+ Sk4f c = *(const float*)ctx;
+
+ r = lerp(dr, r, c);
+ g = lerp(dg, g, c);
+ b = lerp(db, b, c);
+ a = lerp(da, a, c);
+ }
+
+ // s' = sc for 8-bit c.
+ KERNEL_Sk4f(scale_u8) {
+ auto ptr = (const uint8_t*)ctx + x;
+
+ Sk4f c = SkNx_cast<float>(load_tail(tail, ptr)) * (1/255.0f);
+ r = r*c;
+ g = g*c;
+ b = b*c;
+ a = a*c;
+ }
+
+ // s' = d(1-c) + sc for 8-bit c.
+ KERNEL_Sk4f(lerp_u8) {
+ auto ptr = (const uint8_t*)ctx + x;
+
+ Sk4f c = SkNx_cast<float>(load_tail(tail, ptr)) * (1/255.0f);
+ r = lerp(dr, r, c);
+ g = lerp(dg, g, c);
+ b = lerp(db, b, c);
+ a = lerp(da, a, c);
+ }
+
+ // s' = d(1-c) + sc for 565 c.
+ KERNEL_Sk4f(lerp_565) {
+ auto ptr = (const uint16_t*)ctx + x;
+ Sk4f cr, cg, cb;
+ from_565(load_tail(tail, ptr), &cr, &cg, &cb);
+
+ r = lerp(dr, r, cr);
+ g = lerp(dg, g, cg);
+ b = lerp(db, b, cb);
+ a = 1.0f;
+ }
+
+ KERNEL_Sk4f(load_d_565) {
+ auto ptr = (const uint16_t*)ctx + x;
+ from_565(load_tail(tail, ptr), &dr,&dg,&db);
+ da = 1.0f;
+ }
+
+ KERNEL_Sk4f(load_s_565) {
+ auto ptr = (const uint16_t*)ctx + x;
+ from_565(load_tail(tail, ptr), &r,&g,&b);
+ a = 1.0f;
+ }
+
+ KERNEL_Sk4f(store_565) {
+ clamp_01_premul(r,g,b,a);
+ auto ptr = (uint16_t*)ctx + x;
+ store_tail(tail, to_565(r,g,b), ptr);
+ }
+
+ KERNEL_Sk4f(load_d_f16) {
+ auto ptr = (const uint64_t*)ctx + x;
+
+ if (tail) {
+ auto p0 = SkHalfToFloat_finite_ftz(ptr[0]) ,
+ p1 = tail>1 ? SkHalfToFloat_finite_ftz(ptr[1]) : Sk4f{0},
+ p2 = tail>2 ? SkHalfToFloat_finite_ftz(ptr[2]) : Sk4f{0};
+ dr = { p0[0],p1[0],p2[0],0 };
+ dg = { p0[1],p1[1],p2[1],0 };
+ db = { p0[2],p1[2],p2[2],0 };
+ da = { p0[3],p1[3],p2[3],0 };
+ return;
+ }
+
+ Sk4h rh, gh, bh, ah;
+ Sk4h_load4(ptr, &rh, &gh, &bh, &ah);
+ dr = SkHalfToFloat_finite_ftz(rh);
+ dg = SkHalfToFloat_finite_ftz(gh);
+ db = SkHalfToFloat_finite_ftz(bh);
+ da = SkHalfToFloat_finite_ftz(ah);
+ }
+
+ KERNEL_Sk4f(load_s_f16) {
+ auto ptr = (const uint64_t*)ctx + x;
+
+ if (tail) {
+ auto p0 = SkHalfToFloat_finite_ftz(ptr[0]) ,
+ p1 = tail>1 ? SkHalfToFloat_finite_ftz(ptr[1]) : Sk4f{0},
+ p2 = tail>2 ? SkHalfToFloat_finite_ftz(ptr[2]) : Sk4f{0};
+ r = { p0[0],p1[0],p2[0],0 };
+ g = { p0[1],p1[1],p2[1],0 };
+ b = { p0[2],p1[2],p2[2],0 };
+ a = { p0[3],p1[3],p2[3],0 };
+ return;
+ }
+
+ Sk4h rh, gh, bh, ah;
+ Sk4h_load4(ptr, &rh, &gh, &bh, &ah);
+ r = SkHalfToFloat_finite_ftz(rh);
+ g = SkHalfToFloat_finite_ftz(gh);
+ b = SkHalfToFloat_finite_ftz(bh);
+ a = SkHalfToFloat_finite_ftz(ah);
+ }
+
+ KERNEL_Sk4f(store_f16) {
+ clamp_01_premul(r,g,b,a);
+ auto ptr = (uint64_t*)ctx + x;
+
+ switch (tail) {
+ case 0: return Sk4h_store4(ptr, SkFloatToHalf_finite_ftz(r),
+ SkFloatToHalf_finite_ftz(g),
+ SkFloatToHalf_finite_ftz(b),
+ SkFloatToHalf_finite_ftz(a));
+
+ case 3: SkFloatToHalf_finite_ftz({r[2], g[2], b[2], a[2]}).store(ptr+2);
+ case 2: SkFloatToHalf_finite_ftz({r[1], g[1], b[1], a[1]}).store(ptr+1);
+ case 1: SkFloatToHalf_finite_ftz({r[0], g[0], b[0], a[0]}).store(ptr+0);
+ }
+ }
+
+
+ // Load 8-bit SkPMColor-order sRGB.
+ KERNEL_Sk4f(load_d_srgb) {
+ auto ptr = (const uint32_t*)ctx + x;
+
+ if (tail) {
+ float rs[] = {0,0,0,0},
+ gs[] = {0,0,0,0},
+ bs[] = {0,0,0,0},
+ as[] = {0,0,0,0};
+ for (size_t i = 0; i < tail; i++) {
+ rs[i] = sk_linear_from_srgb[(ptr[i] >> SK_R32_SHIFT) & 0xff];
+ gs[i] = sk_linear_from_srgb[(ptr[i] >> SK_G32_SHIFT) & 0xff];
+ bs[i] = sk_linear_from_srgb[(ptr[i] >> SK_B32_SHIFT) & 0xff];
+ as[i] = (1/255.0f) * (ptr[i] >> SK_A32_SHIFT) ;
+ }
+ dr = Sk4f::Load(rs);
+ dg = Sk4f::Load(gs);
+ db = Sk4f::Load(bs);
+ da = Sk4f::Load(as);
+ return;
+ }
+
+ dr = { sk_linear_from_srgb[(ptr[0] >> SK_R32_SHIFT) & 0xff],
+ sk_linear_from_srgb[(ptr[1] >> SK_R32_SHIFT) & 0xff],
+ sk_linear_from_srgb[(ptr[2] >> SK_R32_SHIFT) & 0xff],
+ sk_linear_from_srgb[(ptr[3] >> SK_R32_SHIFT) & 0xff] };
+
+ dg = { sk_linear_from_srgb[(ptr[0] >> SK_G32_SHIFT) & 0xff],
+ sk_linear_from_srgb[(ptr[1] >> SK_G32_SHIFT) & 0xff],
+ sk_linear_from_srgb[(ptr[2] >> SK_G32_SHIFT) & 0xff],
+ sk_linear_from_srgb[(ptr[3] >> SK_G32_SHIFT) & 0xff] };
+
+ db = { sk_linear_from_srgb[(ptr[0] >> SK_B32_SHIFT) & 0xff],
+ sk_linear_from_srgb[(ptr[1] >> SK_B32_SHIFT) & 0xff],
+ sk_linear_from_srgb[(ptr[2] >> SK_B32_SHIFT) & 0xff],
+ sk_linear_from_srgb[(ptr[3] >> SK_B32_SHIFT) & 0xff] };
+
+ da = SkNx_cast<float>(Sk4u::Load(ptr) >> SK_A32_SHIFT) * (1/255.0f);
+ }
+
+ KERNEL_Sk4f(load_s_srgb) {
+ auto ptr = (const uint32_t*)ctx + x;
+
+ if (tail) {
+ float rs[] = {0,0,0,0},
+ gs[] = {0,0,0,0},
+ bs[] = {0,0,0,0},
+ as[] = {0,0,0,0};
+ for (size_t i = 0; i < tail; i++) {
+ rs[i] = sk_linear_from_srgb[(ptr[i] >> SK_R32_SHIFT) & 0xff];
+ gs[i] = sk_linear_from_srgb[(ptr[i] >> SK_G32_SHIFT) & 0xff];
+ bs[i] = sk_linear_from_srgb[(ptr[i] >> SK_B32_SHIFT) & 0xff];
+ as[i] = (1/255.0f) * (ptr[i] >> SK_A32_SHIFT) ;
+ }
+ r = Sk4f::Load(rs);
+ g = Sk4f::Load(gs);
+ b = Sk4f::Load(bs);
+ a = Sk4f::Load(as);
+ return;
+ }
+
+ r = { sk_linear_from_srgb[(ptr[0] >> SK_R32_SHIFT) & 0xff],
+ sk_linear_from_srgb[(ptr[1] >> SK_R32_SHIFT) & 0xff],
+ sk_linear_from_srgb[(ptr[2] >> SK_R32_SHIFT) & 0xff],
+ sk_linear_from_srgb[(ptr[3] >> SK_R32_SHIFT) & 0xff] };
+
+ g = { sk_linear_from_srgb[(ptr[0] >> SK_G32_SHIFT) & 0xff],
+ sk_linear_from_srgb[(ptr[1] >> SK_G32_SHIFT) & 0xff],
+ sk_linear_from_srgb[(ptr[2] >> SK_G32_SHIFT) & 0xff],
+ sk_linear_from_srgb[(ptr[3] >> SK_G32_SHIFT) & 0xff] };
+
+ b = { sk_linear_from_srgb[(ptr[0] >> SK_B32_SHIFT) & 0xff],
+ sk_linear_from_srgb[(ptr[1] >> SK_B32_SHIFT) & 0xff],
+ sk_linear_from_srgb[(ptr[2] >> SK_B32_SHIFT) & 0xff],
+ sk_linear_from_srgb[(ptr[3] >> SK_B32_SHIFT) & 0xff] };
+
+ a = SkNx_cast<float>(Sk4u::Load(ptr) >> SK_A32_SHIFT) * (1/255.0f);
+ }
+
+ KERNEL_Sk4f(store_srgb) {
+ clamp_01_premul(r,g,b,a);
+ auto ptr = (uint32_t*)ctx + x;
+ store_tail(tail, ( sk_linear_to_srgb_noclamp(r) << SK_R32_SHIFT
+ | sk_linear_to_srgb_noclamp(g) << SK_G32_SHIFT
+ | sk_linear_to_srgb_noclamp(b) << SK_B32_SHIFT
+ | Sk4f_round(255.0f * a) << SK_A32_SHIFT), (int*)ptr);
+ }
+
+}
+
+#endif//SkRasterPipeline_opts_DEFINED
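
Tying the header together: a new stage needs only one KERNEL_Sk4f definition, and the body<>/tail<> templates generate both pipeline entry points from it. A sketch with a hypothetical stage, assuming it is defined inside SK_OPTS_NS beside the others:

    // Hypothetical tail-insensitive stage: force the color opaque.
    KERNEL_Sk4f(force_opaque) {
        a = 1.0f;
    }
    // It touches no memory, so one Fn built from body<> can serve as both body and tail:
    SkRasterPipeline::Fn force_opaque_fn = body<force_opaque, /*kCallNext=*/true>;
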
diff --git a/tests/SkRasterPipelineTest.cpp b/tests/SkRasterPipelineTest.cpp
index 867baf7918..ccc728e64a 100644
--- a/tests/SkRasterPipelineTest.cpp
+++ b/tests/SkRasterPipelineTest.cpp
@@ -8,25 +8,33 @@
#include "Test.h"
#include "SkRasterPipeline.h"
-SK_RASTER_STAGE(load) {
- auto ptr = (const float*)ctx + x;
+static void SK_VECTORCALL load(SkRasterPipeline::Stage* st, size_t x, size_t tail,
+ Sk4f r, Sk4f g, Sk4f b, Sk4f a,
+ Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+ auto ptr = st->ctx<const float*>() + x;
switch(tail&3) {
case 0: a = Sk4f{ptr[3]};
case 3: b = Sk4f{ptr[2]};
case 2: g = Sk4f{ptr[1]};
case 1: r = Sk4f{ptr[0]};
}
+ st->next(x,tail, r,g,b,a, dr,dg,db,da);
}
-SK_RASTER_STAGE(square) {
+static void SK_VECTORCALL square(SkRasterPipeline::Stage* st, size_t x, size_t tail,
+ Sk4f r, Sk4f g, Sk4f b, Sk4f a,
+ Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
r *= r;
g *= g;
b *= b;
a *= a;
+ st->next(x,tail, r,g,b,a, dr,dg,db,da);
}
-SK_RASTER_STAGE(store) {
- auto ptr = (float*)ctx + x;
+static void SK_VECTORCALL store(SkRasterPipeline::Stage* st, size_t x, size_t tail,
+ Sk4f r, Sk4f g, Sk4f b, Sk4f a,
+ Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+ auto ptr = st->ctx<float*>() + x;
switch (tail&3) {
case 0: ptr[3] = a[0];
case 3: ptr[2] = b[0];
@@ -41,6 +49,8 @@ DEF_TEST(SkRasterPipeline, r) {
// - context pointers (load,store)
// - stages sensitive to the number of pixels (load,store)
// - stages insensitive to the number of pixels (square)
+ // - stages that chain to the next stage (load,square)
+ // - stages that terminate the pipeline (store)
//
// This pipeline loads up some values, squares them, then writes them back to memory.
@@ -48,9 +58,9 @@ DEF_TEST(SkRasterPipeline, r) {
float dst_vals[] = { 0,0,0,0,0 };
SkRasterPipeline p;
- p.append<load>(src_vals);
- p.append<square>();
- p.append<store>(dst_vals);
+ p.append(load, src_vals);
+ p.append(square);
+ p.append(store, dst_vals);
p.run(5);
@@ -71,6 +81,6 @@ DEF_TEST(SkRasterPipeline_nonsense, r) {
// No asserts... just a test that this is safe to run and terminates.
// square() always calls st->next(); this makes sure we've always got something there to call.
SkRasterPipeline p;
- p.append<square>();
+ p.append(square);
p.run(20);
}
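
For the curious, p.run(5) above walks the stages twice; roughly, based on the fused loop removed from the benchmark:

    // body chain: x=0, tail=0  -> pixels 0..3 at the full 4-wide stride
    // tail chain: x=4, tail=1  -> the single leftover value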