Diffstat (limited to 'src/jumper/SkJumper_stages_lowp.cpp')
-rw-r--r--  src/jumper/SkJumper_stages_lowp.cpp  |  221
1 file changed, 162 insertions(+), 59 deletions(-)
diff --git a/src/jumper/SkJumper_stages_lowp.cpp b/src/jumper/SkJumper_stages_lowp.cpp
index 79e3fbd0d7..247bdf6a2b 100644
--- a/src/jumper/SkJumper_stages_lowp.cpp
+++ b/src/jumper/SkJumper_stages_lowp.cpp
@@ -42,12 +42,16 @@
using U8 = uint8_t __attribute__((ext_vector_type(16)));
using U16 = uint16_t __attribute__((ext_vector_type(16)));
using I16 = int16_t __attribute__((ext_vector_type(16)));
+ using I32 = int32_t __attribute__((ext_vector_type(16)));
using U32 = uint32_t __attribute__((ext_vector_type(16)));
+ using F = float __attribute__((ext_vector_type(16)));
#else
using U8 = uint8_t __attribute__((ext_vector_type(8)));
using U16 = uint16_t __attribute__((ext_vector_type(8)));
using I16 = int16_t __attribute__((ext_vector_type(8)));
+ using I32 = int32_t __attribute__((ext_vector_type(8)));
using U32 = uint32_t __attribute__((ext_vector_type(8)));
+ using F = float __attribute__((ext_vector_type(8)));
#endif
static const size_t N = sizeof(U16) / sizeof(uint16_t);
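With AVX2 these lowp stages now run 16 pixels per pass (N == 16), otherwise 8. The new F and I32 types are deliberately full width, so one float vector is exactly as wide as two U16 registers; that is what lets the GG/GP wrappers below carry (x,y) through the r,g / b,a register pairs with join() and split(). An illustrative sanity check, not part of the CL:

    static_assert(sizeof(F) == 2*sizeof(U16), "one F spans two U16 registers");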
@@ -78,7 +82,49 @@ ABI extern "C" void WRAP(start_pipeline)(const size_t x0,
ABI extern "C" void WRAP(just_return)(size_t,void**,size_t,size_t,
U16,U16,U16,U16, U16,U16,U16,U16) {}
-#define STAGE(name, ...) \
+// All stages use the same function call ABI to chain into each other, but there are three types:
+// GG: geometry in, geometry out -- think, a matrix
+// GP: geometry in, pixels out. -- think, a memory gather
+// PP: pixels in, pixels out. -- think, a blend mode
+//
+// (Some stages ignore their inputs or produce no logical output. That's perfectly fine.)
+//
+// These three STAGE_ macros let you define each type of stage,
+// and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate.
+
+#define STAGE_GG(name, ...) \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
+ ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
+ U16 r, U16 g, U16 b, U16 a, \
+ U16 dr, U16 dg, U16 db, U16 da) { \
+ auto x = join<F>(r,g), \
+ y = join<F>(b,a); \
+ name##_k(Ctx{program}, dx,dy,tail, x,y); \
+ split(x, &r,&g); \
+ split(y, &b,&a); \
+ auto next = (Stage)load_and_inc(program); \
+ next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
+ } \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)
+
+#define STAGE_GP(name, ...) \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
+ U16& r, U16& g, U16& b, U16& a, \
+ U16& dr, U16& dg, U16& db, U16& da); \
+ ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
+ U16 r, U16 g, U16 b, U16 a, \
+ U16 dr, U16 dg, U16 db, U16 da) { \
+ auto x = join<F>(r,g), \
+ y = join<F>(b,a); \
+ name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \
+ auto next = (Stage)load_and_inc(program); \
+ next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
+ } \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
+ U16& r, U16& g, U16& b, U16& a, \
+ U16& dr, U16& dg, U16& db, U16& da)
+
+#define STAGE_PP(name, ...) \
SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
U16& r, U16& g, U16& b, U16& a, \
U16& dr, U16& dg, U16& db, U16& da); \
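Each of these macros declares a user-written name##_k kernel plus an ABI wrapper that adapts the common register convention to the kernel's arguments and then chains into the next stage via load_and_inc(). Purely for illustration, a hypothetical PP stage would be written like this (force_opaque is invented here, not part of this CL):

    STAGE_PP(force_opaque, Ctx::None) {
        a = 255;   // pixels in, pixels out; geometry and dst channels simply go unused
    }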
@@ -136,50 +182,72 @@ SI D join(S lo, S hi) {
return v;
}
+// TODO: do we need platform-specific intrinsics for any of these?
+SI F if_then_else(I32 c, F t, F e) {
+ return bit_cast<F>( (bit_cast<I32>(t) & c) | (bit_cast<I32>(e) & ~c) );
+}
+SI F max(F x, F y) { return if_then_else(x < y, y, x); }
+SI F min(F x, F y) { return if_then_else(x < y, x, y); }
+
+SI F mad(F f, F m, F a) { return f*m+a; }
+SI U32 trunc_(F x) { return (U32)cast<I32>(x); }
+
// ~~~~~~ Basic / misc. stages ~~~~~~ //
-STAGE(uniform_color, const SkJumper_UniformColorCtx* c) {
+STAGE_GG(seed_shader, const float* iota) {
+ x = cast<F>(I32(dx)) + unaligned_load<F>(iota);
+ y = cast<F>(I32(dy)) + 0.5f;
+}
+
+STAGE_GG(matrix_2x3, const float* m) {
+ auto X = mad(x,m[0], mad(y,m[2], m[4])),
+ Y = mad(x,m[1], mad(y,m[3], m[5]));
+ x = X;
+ y = Y;
+}
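Read per lane, the two new geometry stages reduce to simple scalar math (the m[] layout is read straight off the mad() calls; iota is presumed to hold pixel-center offsets {0.5f, 1.5f, ...}, which this diff does not show):

    x = dx + iota[i];               // seed_shader
    y = dy + 0.5f;
    X = x*m[0] + y*m[2] + m[4];     // matrix_2x3: affine 2x3, columns (m0,m1), (m2,m3), (m4,m5)
    Y = x*m[1] + y*m[3] + m[5];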
+
+STAGE_PP(uniform_color, const SkJumper_UniformColorCtx* c) {
r = c->rgba[0];
g = c->rgba[1];
b = c->rgba[2];
a = c->rgba[3];
}
-STAGE(black_color, Ctx::None) { r = g = b = 0; a = 255; }
-STAGE(white_color, Ctx::None) { r = g = b = 255; a = 255; }
+STAGE_PP(black_color, Ctx::None) { r = g = b = 0; a = 255; }
+STAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; }
-STAGE(set_rgb, const float rgb[3]) {
+STAGE_PP(set_rgb, const float rgb[3]) {
r = from_float(rgb[0]);
g = from_float(rgb[1]);
b = from_float(rgb[2]);
}
-STAGE(premul, Ctx::None) {
+STAGE_PP(premul, Ctx::None) {
r = div255(r * a);
g = div255(g * a);
b = div255(b * a);
}
-STAGE(swap_rb, Ctx::None) {
+STAGE_PP(swap_rb, Ctx::None) {
auto tmp = r;
r = b;
b = tmp;
}
-STAGE(move_src_dst, Ctx::None) {
+STAGE_PP(move_src_dst, Ctx::None) {
dr = r;
dg = g;
db = b;
da = a;
}
-STAGE(move_dst_src, Ctx::None) {
+STAGE_PP(move_dst_src, Ctx::None) {
r = dr;
g = dg;
b = db;
a = da;
}
-STAGE(invert, Ctx::None) {
+STAGE_PP(invert, Ctx::None) {
r = inv(r);
g = inv(g);
b = inv(b);
@@ -191,7 +259,7 @@ STAGE(invert, Ctx::None) {
// The same logic applied to all 4 channels.
#define BLEND_MODE(name) \
SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
- STAGE(name, Ctx::None) { \
+ STAGE_PP(name, Ctx::None) { \
r = name##_channel(r,dr,a,da); \
g = name##_channel(g,dg,a,da); \
b = name##_channel(b,db,a,da); \
@@ -218,7 +286,7 @@ STAGE(invert, Ctx::None) {
// The same logic applied to color, and srcover for alpha.
#define BLEND_MODE(name) \
SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
- STAGE(name, Ctx::None) { \
+ STAGE_PP(name, Ctx::None) { \
r = name##_channel(r,dr,a,da); \
g = name##_channel(g,dg,a,da); \
b = name##_channel(b,db,a,da); \
@@ -248,6 +316,19 @@ SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, size_t dx, size_t dy) {
return (T*)ctx->pixels + dy*ctx->stride + dx;
}
+template <typename T>
+SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) {
+ auto clamp = [](F v, F limit) {
+ limit = bit_cast<F>( bit_cast<U32>(limit) - 1 ); // Exclusive -> inclusive.
+ return min(max(0, v), limit);
+ };
+ x = clamp(x, ctx->width);
+ y = clamp(y, ctx->height);
+
+ *ptr = (const T*)ctx->pixels;
+ return trunc_(y)*ctx->stride + trunc_(x);
+}
+
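The clamp lambda's limit adjustment works because, for a positive finite float, subtracting 1 from its bit pattern yields the previous representable value, so the exclusive bounds (width, height) become the largest coordinates that still floor inside the image. A scalar sketch of the same idea, assuming <math.h>:

    float inclusive = nextafterf(limit, 0.0f);    // e.g. 64.0f -> ~63.999996f
    v = fminf(fmaxf(v, 0.0f), inclusive);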
template <typename V, typename T>
SI V load(const T* ptr, size_t tail) {
V v = 0;
@@ -297,8 +378,48 @@ SI void store(T* ptr, size_t tail, V v) {
}
}
+template <typename V, typename T>
+SI V gather(const T* ptr, U32 ix) {
+#if defined(__AVX2__)
+ return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
+ ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
+ ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
+ ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
+#else
+ return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
+ ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
+#endif
+}
+// TODO: AVX2 gather instructions where possible
+
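One plausible shape for that TODO in the 32-bit case, reusing the split()/join() helpers above (an untested sketch, not part of this CL; scale 4 because the elements are 32-bit):

    __m256i lo,hi;
    split(ix, &lo,&hi);                                    // 16 indices -> two 8-lane halves
    U32 px = join<U32>(_mm256_i32gather_epi32((const int*)ptr, lo, 4),
                       _mm256_i32gather_epi32((const int*)ptr, hi, 4));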
+
// ~~~~~~ 32-bit memory loads and stores ~~~~~~ //
+SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
+#if 1 && defined(__AVX2__)
+ // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
+ __m256i _01,_23;
+ split(rgba, &_01, &_23);
+ __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
+ _13 = _mm256_permute2x128_si256(_01,_23, 0x31);
+ rgba = join<U32>(_02, _13);
+
+ auto cast_U16 = [](U32 v) -> U16 {
+ __m256i _02,_13;
+ split(v, &_02,&_13);
+ return _mm256_packus_epi32(_02,_13);
+ };
+#else
+ auto cast_U16 = [](U32 v) -> U16 {
+ return cast<U16>(v);
+ };
+#endif
+ *r = cast_U16(rgba & 65535) & 255;
+ *g = cast_U16(rgba & 65535) >> 8;
+ *b = cast_U16(rgba >> 16) & 255;
+ *a = cast_U16(rgba >> 16) >> 8;
+}
+
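The permute/pack dance above is needed because _mm256_packus_epi32() packs within each 128-bit lane rather than across the whole register, so pre-swapping the middle lanes makes the packed 16-bit values land back in pixel order:

    // packus(a,b), per 128-bit lane: { a0..a3, b0..b3 | a4..a7, b4..b7 }
    // _02 = { px 0..3 | px 8..11 },   _13 = { px 4..7 | px 12..15 }
    // so packus(_02,_13) = { px 0..3, 4..7 | px 8..11, 12..15 }, i.e. pixels 0..15 in order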
SI void load_8888(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
#if 1 && defined(__ARM_NEON)
uint8x8x4_t rgba;
@@ -316,32 +437,8 @@ SI void load_8888(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16*
*g = cast<U16>(rgba.val[1]);
*b = cast<U16>(rgba.val[2]);
*a = cast<U16>(rgba.val[3]);
-#elif 1 && defined(__AVX2__)
- // Load normally.
- U32 rgba = load<U32>(ptr, tail);
-
- // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
- __m256i _01,_23;
- split(rgba, &_01, &_23);
- __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
- _13 = _mm256_permute2x128_si256(_01,_23, 0x31);
- rgba = join<U32>(_02, _13);
-
- auto cast_U16 = [](U32 v) -> U16 {
- __m256i _02,_13;
- split(v, &_02,&_13);
- return _mm256_packus_epi32(_02,_13);
- };
- *r = cast_U16(rgba & 65535) & 255;
- *g = cast_U16(rgba & 65535) >> 8;
- *b = cast_U16(rgba >> 16) & 255;
- *a = cast_U16(rgba >> 16) >> 8;
#else
- U32 rgba = load<U32>(ptr, tail);
- *r = cast<U16>(rgba & 65535) & 255;
- *g = cast<U16>(rgba & 65535) >> 8;
- *b = cast<U16>(rgba >> 16) & 255;
- *a = cast<U16>(rgba >> 16) >> 8;
+ from_8888(load<U32>(ptr, tail), r,g,b,a);
#endif
}
SI void store_8888(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
@@ -368,26 +465,32 @@ SI void store_8888(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
#endif
}
-STAGE(load_8888, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(load_8888, const SkJumper_MemoryCtx* ctx) {
load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
}
-STAGE(load_8888_dst, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(load_8888_dst, const SkJumper_MemoryCtx* ctx) {
load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
}
-STAGE(store_8888, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(store_8888, const SkJumper_MemoryCtx* ctx) {
store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a);
}
-STAGE(load_bgra, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(load_bgra, const SkJumper_MemoryCtx* ctx) {
load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &b,&g,&r,&a);
}
-STAGE(load_bgra_dst, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(load_bgra_dst, const SkJumper_MemoryCtx* ctx) {
load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &db,&dg,&dr,&da);
}
-STAGE(store_bgra, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(store_bgra, const SkJumper_MemoryCtx* ctx) {
store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, b,g,r,a);
}
+STAGE_GP(gather_8888, const SkJumper_GatherCtx* ctx) {
+ const uint32_t* ptr;
+ U32 ix = ix_and_ptr(&ptr, ctx, x,y);
+ from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
+}
+
// ~~~~~~ 16-bit memory loads and stores ~~~~~~ //
SI void load_565(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
@@ -413,15 +516,15 @@ SI void store_565(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) {
| B << 0);
}
-STAGE(load_565, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(load_565, const SkJumper_MemoryCtx* ctx) {
load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b);
a = 255;
}
-STAGE(load_565_dst, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(load_565_dst, const SkJumper_MemoryCtx* ctx) {
load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db);
da = 255;
}
-STAGE(store_565, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(store_565, const SkJumper_MemoryCtx* ctx) {
store_565(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b);
}
@@ -434,41 +537,41 @@ SI void store_8(uint8_t* ptr, size_t tail, U16 v) {
store(ptr, tail, cast<U8>(v));
}
-STAGE(load_a8, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(load_a8, const SkJumper_MemoryCtx* ctx) {
r = g = b = 0;
a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
-STAGE(load_a8_dst, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(load_a8_dst, const SkJumper_MemoryCtx* ctx) {
dr = dg = db = 0;
da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
-STAGE(store_a8, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(store_a8, const SkJumper_MemoryCtx* ctx) {
store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a);
}
-STAGE(load_g8, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(load_g8, const SkJumper_MemoryCtx* ctx) {
r = g = b = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
a = 255;
}
-STAGE(load_g8_dst, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(load_g8_dst, const SkJumper_MemoryCtx* ctx) {
dr = dg = db = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
da = 255;
}
-STAGE(luminance_to_alpha, Ctx::None) {
+STAGE_PP(luminance_to_alpha, Ctx::None) {
a = (r*54 + g*183 + b*19)/256; // 0.2126, 0.7152, 0.0722 with 256 denominator.
r = g = b = 0;
}
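As a quick check on that comment: 54 + 183 + 19 = 256, and 54/256 ~ 0.211, 183/256 ~ 0.715, 19/256 ~ 0.074, i.e. the BT.709 luma weights rounded so the three terms still sum to exactly 256.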
// ~~~~~~ Coverage scales / lerps ~~~~~~ //
-STAGE(scale_1_float, const float* f) {
+STAGE_PP(scale_1_float, const float* f) {
U16 c = from_float(*f);
r = div255( r * c );
g = div255( g * c );
b = div255( b * c );
a = div255( a * c );
}
-STAGE(lerp_1_float, const float* f) {
+STAGE_PP(lerp_1_float, const float* f) {
U16 c = from_float(*f);
r = lerp(dr, r, c);
g = lerp(dg, g, c);
@@ -476,14 +579,14 @@ STAGE(lerp_1_float, const float* f) {
a = lerp(da, a, c);
}
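Assuming lerp(d, s, c) is the usual byte-coverage mix defined earlier in this file (not shown in this diff), each lerp_* channel works out to

    r = div255( r*c + dr*(255 - c) );    // i.e. dr + (r - dr)*c/255, to 8-bit precision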
-STAGE(scale_u8, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(scale_u8, const SkJumper_MemoryCtx* ctx) {
U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
r = div255( r * c );
g = div255( g * c );
b = div255( b * c );
a = div255( a * c );
}
-STAGE(lerp_u8, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(lerp_u8, const SkJumper_MemoryCtx* ctx) {
U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
r = lerp(dr, r, c);
g = lerp(dg, g, c);
@@ -496,7 +599,7 @@ SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) {
return if_then_else(a < da, min(cr,cg,cb)
, max(cr,cg,cb));
}
-STAGE(scale_565, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(scale_565, const SkJumper_MemoryCtx* ctx) {
U16 cr,cg,cb;
load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
@@ -506,7 +609,7 @@ STAGE(scale_565, const SkJumper_MemoryCtx* ctx) {
b = div255( b * cb );
a = div255( a * ca );
}
-STAGE(lerp_565, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(lerp_565, const SkJumper_MemoryCtx* ctx) {
U16 cr,cg,cb;
load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
@@ -519,7 +622,7 @@ STAGE(lerp_565, const SkJumper_MemoryCtx* ctx) {
// ~~~~~~ Compound stages ~~~~~~ //
-STAGE(srcover_rgba_8888, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(srcover_rgba_8888, const SkJumper_MemoryCtx* ctx) {
auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
load_8888(ptr, tail, &dr,&dg,&db,&da);