Diffstat (limited to 'src/jumper/SkJumper_stages_lowp.cpp')
-rw-r--r-- | src/jumper/SkJumper_stages_lowp.cpp | 221
1 file changed, 162 insertions(+), 59 deletions(-)
diff --git a/src/jumper/SkJumper_stages_lowp.cpp b/src/jumper/SkJumper_stages_lowp.cpp
index 79e3fbd0d7..247bdf6a2b 100644
--- a/src/jumper/SkJumper_stages_lowp.cpp
+++ b/src/jumper/SkJumper_stages_lowp.cpp
@@ -42,12 +42,16 @@
     using U8  = uint8_t  __attribute__((ext_vector_type(16)));
     using U16 = uint16_t __attribute__((ext_vector_type(16)));
     using I16 = int16_t  __attribute__((ext_vector_type(16)));
+    using I32 = int32_t  __attribute__((ext_vector_type(16)));
     using U32 = uint32_t __attribute__((ext_vector_type(16)));
+    using F   = float    __attribute__((ext_vector_type(16)));
 #else
     using U8  = uint8_t  __attribute__((ext_vector_type(8)));
     using U16 = uint16_t __attribute__((ext_vector_type(8)));
     using I16 = int16_t  __attribute__((ext_vector_type(8)));
+    using I32 = int32_t  __attribute__((ext_vector_type(8)));
     using U32 = uint32_t __attribute__((ext_vector_type(8)));
+    using F   = float    __attribute__((ext_vector_type(8)));
 #endif

 static const size_t N = sizeof(U16) / sizeof(uint16_t);
@@ -78,7 +82,49 @@ ABI extern "C" void WRAP(start_pipeline)(const size_t x0,
 ABI extern "C" void WRAP(just_return)(size_t,void**,size_t,size_t,
                                       U16,U16,U16,U16, U16,U16,U16,U16) {}

-#define STAGE(name, ...)                                                  \
+// All stages use the same function call ABI to chain into each other, but there are three types:
+//   GG: geometry in, geometry out  -- think, a matrix
+//   GP: geometry in, pixels out.   -- think, a memory gather
+//   PP: pixels in, pixels out.     -- think, a blend mode
+//
+// (Some stages ignore their inputs or produce no logical output.  That's perfectly fine.)
+//
+// These three STAGE_ macros let you define each type of stage,
+// and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate.
+
+#define STAGE_GG(name, ...)                                               \
+    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
+    ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
+                                   U16  r, U16  g, U16  b, U16  a,        \
+                                   U16 dr, U16 dg, U16 db, U16 da) {      \
+        auto x = join<F>(r,g),                                            \
+             y = join<F>(b,a);                                            \
+        name##_k(Ctx{program}, dx,dy,tail, x,y);                          \
+        split(x, &r,&g);                                                  \
+        split(y, &b,&a);                                                  \
+        auto next = (Stage)load_and_inc(program);                         \
+        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                   \
+    }                                                                     \
+    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)
+
+#define STAGE_GP(name, ...)                                               \
+    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
+                     U16&  r, U16&  g, U16&  b, U16&  a,                  \
+                     U16& dr, U16& dg, U16& db, U16& da);                 \
+    ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
+                                   U16  r, U16  g, U16  b, U16  a,        \
+                                   U16 dr, U16 dg, U16 db, U16 da) {      \
+        auto x = join<F>(r,g),                                            \
+             y = join<F>(b,a);                                            \
+        name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da);    \
+        auto next = (Stage)load_and_inc(program);                         \
+        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                   \
+    }                                                                     \
+    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
+                     U16&  r, U16&  g, U16&  b, U16&  a,                  \
+                     U16& dr, U16& dg, U16& db, U16& da)
+
+#define STAGE_PP(name, ...)                                               \
     SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,      \
                      U16&  r, U16&  g, U16&  b, U16&  a,                  \
                      U16& dr, U16& dg, U16& db, U16& da);                 \
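
A note on the ABI trick in STAGE_GG and STAGE_GP above: the lowp pipeline only passes eight 16-bit vector registers between stages, so geometry stages reuse the (r,g) and (b,a) registers as raw storage for the float x and y vectors, via join and split. Below is a minimal standalone sketch of that reinterpretation, assuming Clang's ext_vector_type extension and the 8-lane (non-AVX2) case; join and split here are simplified stand-ins for the helpers this file defines, and main() is purely illustrative.

#include <cstdint>
#include <cstdio>
#include <cstring>

using U16 = uint16_t __attribute__((ext_vector_type(8)));  // 8 x 16 bits = 128 bits
using F   = float    __attribute__((ext_vector_type(8)));  // 8 x 32 bits = 256 bits

template <typename D, typename S>
static D join(S lo, S hi) {                 // concatenate two vectors bit-for-bit
    D v;
    memcpy((char*)&v,             &lo, sizeof(S));
    memcpy((char*)&v + sizeof(S), &hi, sizeof(S));
    return v;
}

template <typename S, typename D>
static void split(S v, D* lo, D* hi) {      // the inverse of join
    memcpy(lo, (char*)&v,             sizeof(D));
    memcpy(hi, (char*)&v + sizeof(D), sizeof(D));
}

int main() {
    U16 r = 1, g = 2;          // pixel registers doubling as raw storage
    F x = join<F>(r, g);       // reinterpret the two 128-bit halves as 8 floats
    x += 0.5f;                 // ... float geometry work happens here ...
    split(x, &r, &g);          // pack the bits back into the U16 registers
    printf("F carries %zu bytes across the stage ABI\n", sizeof(x));
}
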
@@ -136,50 +182,72 @@ SI D join(S lo, S hi) {
     return v;
 }

+// TODO: do we need platform-specific intrinsics for any of these?
+SI F if_then_else(I32 c, F t, F e) {
+    return bit_cast<F>( (bit_cast<I32>(t) & c) | (bit_cast<I32>(e) & ~c) );
+}
+SI F max(F x, F y) { return if_then_else(x < y, y, x); }
+SI F min(F x, F y) { return if_then_else(x < y, x, y); }
+
+SI F mad(F f, F m, F a) { return f*m+a; }
+SI U32 trunc_(F x) { return (U32)cast<I32>(x); }
+
 // ~~~~~~ Basic / misc. stages ~~~~~~ //

-STAGE(uniform_color, const SkJumper_UniformColorCtx* c) {
+STAGE_GG(seed_shader, const float* iota) {
+    x = cast<F>(I32(dx)) + unaligned_load<F>(iota);
+    y = cast<F>(I32(dy)) + 0.5f;
+}
+
+STAGE_GG(matrix_2x3, const float* m) {
+    auto X = mad(x,m[0], mad(y,m[2], m[4])),
+         Y = mad(x,m[1], mad(y,m[3], m[5]));
+    x = X;
+    y = Y;
+}
+
+STAGE_PP(uniform_color, const SkJumper_UniformColorCtx* c) {
     r = c->rgba[0];
     g = c->rgba[1];
     b = c->rgba[2];
     a = c->rgba[3];
 }
-STAGE(black_color, Ctx::None) { r = g = b =   0; a = 255; }
-STAGE(white_color, Ctx::None) { r = g = b = 255; a = 255; }
+STAGE_PP(black_color, Ctx::None) { r = g = b =   0; a = 255; }
+STAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; }

-STAGE(set_rgb, const float rgb[3]) {
+STAGE_PP(set_rgb, const float rgb[3]) {
     r = from_float(rgb[0]);
     g = from_float(rgb[1]);
     b = from_float(rgb[2]);
 }

-STAGE(premul, Ctx::None) {
+STAGE_PP(premul, Ctx::None) {
     r = div255(r * a);
     g = div255(g * a);
     b = div255(b * a);
 }

-STAGE(swap_rb, Ctx::None) {
+STAGE_PP(swap_rb, Ctx::None) {
     auto tmp = r;
     r = b;
     b = tmp;
 }

-STAGE(move_src_dst, Ctx::None) {
+STAGE_PP(move_src_dst, Ctx::None) {
     dr = r;
     dg = g;
     db = b;
     da = a;
 }
-STAGE(move_dst_src, Ctx::None) {
+STAGE_PP(move_dst_src, Ctx::None) {
     r = dr;
     g = dg;
     b = db;
     a = da;
 }

-STAGE(invert, Ctx::None) {
+STAGE_PP(invert, Ctx::None) {
     r = inv(r);
     g = inv(g);
     b = inv(b);
@@ -191,7 +259,7 @@ STAGE(invert, Ctx::None) {
 // The same logic applied to all 4 channels.
 #define BLEND_MODE(name)                                       \
     SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da);       \
-    STAGE(name, Ctx::None) {                                   \
+    STAGE_PP(name, Ctx::None) {                                \
         r = name##_channel(r,dr,a,da);                         \
         g = name##_channel(g,dg,a,da);                         \
         b = name##_channel(b,db,a,da);                         \
@@ -218,7 +286,7 @@ STAGE(invert, Ctx::None) {
 // The same logic applied to color, and srcover for alpha.
 #define BLEND_MODE(name)                                       \
     SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da);       \
-    STAGE(name, Ctx::None) {                                   \
+    STAGE_PP(name, Ctx::None) {                                \
         r = name##_channel(r,dr,a,da);                         \
         g = name##_channel(g,dg,a,da);                         \
         b = name##_channel(b,db,a,da);                         \
@@ -248,6 +316,19 @@ SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, size_t dx, size_t dy) {
     return (T*)ctx->pixels + dy*ctx->stride + dx;
 }

+template <typename T>
+SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) {
+    auto clamp = [](F v, F limit) {
+        limit = bit_cast<F>( bit_cast<U32>(limit) - 1 );  // Exclusive -> inclusive.
+        return min(max(0, v), limit);
+    };
+    x = clamp(x, ctx->width);
+    y = clamp(y, ctx->height);
+
+    *ptr = (const T*)ctx->pixels;
+    return trunc_(y)*ctx->stride + trunc_(x);
+}
+
 template <typename V, typename T>
 SI V load(const T* ptr, size_t tail) {
     V v = 0;
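
The clamp inside ix_and_ptr above leans on a bit trick: for a positive finite float, subtracting 1 from its bit pattern yields the next representable value toward zero. That turns the exclusive bounds ctx->width and ctx->height into inclusive ones, so truncating the clamped coordinate can never index past width-1. Here is a scalar restatement of the same idea; prev_float is an illustrative name, not a helper from the source.

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static float prev_float(float limit) {       // largest float strictly below limit
    uint32_t bits;
    memcpy(&bits, &limit, sizeof(bits));     // bit_cast<U32>(limit)
    bits -= 1;                               // step one ULP toward zero
    float out;
    memcpy(&out, &bits, sizeof(out));
    return out;
}

int main() {
    float width = 100.0f;                    // exclusive upper bound
    assert(prev_float(width) == nextafterf(width, 0.0f));
    float x = fminf(fmaxf(0.0f, 123.4f), prev_float(width));
    assert((uint32_t)x <= 99u);              // truncation stays in bounds
}
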
@@ -297,8 +378,48 @@ SI void store(T* ptr, size_t tail, V v) {
     }
 }

+template <typename V, typename T>
+SI V gather(const T* ptr, U32 ix) {
+#if defined(__AVX2__)
+    return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
+              ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
+              ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
+              ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
+#else
+    return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
+              ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
+#endif
+}
+// TODO: AVX2 gather instructions where possible
+
 // ~~~~~~ 32-bit memory loads and stores ~~~~~~ //

+SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
+#if 1 && defined(__AVX2__)
+    // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
+    __m256i _01,_23;
+    split(rgba, &_01, &_23);
+    __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
+            _13 = _mm256_permute2x128_si256(_01,_23, 0x31);
+    rgba = join<U32>(_02, _13);
+
+    auto cast_U16 = [](U32 v) -> U16 {
+        __m256i _02,_13;
+        split(v, &_02,&_13);
+        return _mm256_packus_epi32(_02,_13);
+    };
+#else
+    auto cast_U16 = [](U32 v) -> U16 {
+        return cast<U16>(v);
+    };
+#endif
+    *r = cast_U16(rgba & 65535) & 255;
+    *g = cast_U16(rgba & 65535) >>  8;
+    *b = cast_U16(rgba >> 16) & 255;
+    *a = cast_U16(rgba >> 16) >>  8;
+}
+
 SI void load_8888(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
 #if 1 && defined(__ARM_NEON)
     uint8x8x4_t rgba;
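
The gather above indexes lane by lane, and its TODO notes that AVX2 has dedicated gather instructions. A hedged sketch of what that could look like for eight 32-bit lanes follows, using the real _mm256_i32gather_epi32 intrinsic; gather32 is an illustrative name, and the AVX2 lowp build is 16 lanes wide, so it would take two of these. Whether it is actually a win depends on the microarchitecture: early AVX2 parts gathered no faster than scalar loads, which may be why this is still a TODO.

#include <immintrin.h>
#include <cstdint>

#if defined(__AVX2__)
static __m256i gather32(const uint32_t* ptr, __m256i ix) {
    // Scale of 4 bytes: ix holds element indices, not byte offsets.
    return _mm256_i32gather_epi32((const int*)ptr, ix, 4);
}
#endif
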
@@ -316,32 +437,8 @@ SI void load_8888(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16*
     *g = cast<U16>(rgba.val[1]);
     *b = cast<U16>(rgba.val[2]);
     *a = cast<U16>(rgba.val[3]);
-#elif 1 && defined(__AVX2__)
-    // Load normally.
-    U32 rgba = load<U32>(ptr, tail);
-
-    // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
-    __m256i _01,_23;
-    split(rgba, &_01, &_23);
-    __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
-            _13 = _mm256_permute2x128_si256(_01,_23, 0x31);
-    rgba = join<U32>(_02, _13);
-
-    auto cast_U16 = [](U32 v) -> U16 {
-        __m256i _02,_13;
-        split(v, &_02,&_13);
-        return _mm256_packus_epi32(_02,_13);
-    };
-    *r = cast_U16(rgba & 65535) & 255;
-    *g = cast_U16(rgba & 65535) >>  8;
-    *b = cast_U16(rgba >> 16) & 255;
-    *a = cast_U16(rgba >> 16) >>  8;
 #else
-    U32 rgba = load<U32>(ptr, tail);
-    *r = cast<U16>(rgba & 65535) & 255;
-    *g = cast<U16>(rgba & 65535) >>  8;
-    *b = cast<U16>(rgba >> 16) & 255;
-    *a = cast<U16>(rgba >> 16) >>  8;
+    from_8888(load<U32>(ptr, tail), r,g,b,a);
 #endif
 }
 SI void store_8888(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
@@ -368,26 +465,32 @@ SI void store_8888(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
 #endif
 }

-STAGE(load_8888, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(load_8888, const SkJumper_MemoryCtx* ctx) {
     load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
 }
-STAGE(load_8888_dst, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(load_8888_dst, const SkJumper_MemoryCtx* ctx) {
     load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
 }
-STAGE(store_8888, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(store_8888, const SkJumper_MemoryCtx* ctx) {
     store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a);
 }

-STAGE(load_bgra, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(load_bgra, const SkJumper_MemoryCtx* ctx) {
     load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &b,&g,&r,&a);
 }
-STAGE(load_bgra_dst, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(load_bgra_dst, const SkJumper_MemoryCtx* ctx) {
     load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &db,&dg,&dr,&da);
 }
-STAGE(store_bgra, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(store_bgra, const SkJumper_MemoryCtx* ctx) {
     store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, b,g,r,a);
 }

+STAGE_GP(gather_8888, const SkJumper_GatherCtx* ctx) {
+    const uint32_t* ptr;
+    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
+    from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
+}
+
 // ~~~~~~ 16-bit memory loads and stores ~~~~~~ //

 SI void load_565(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
@@ -413,15 +516,15 @@ SI void store_565(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) {
            | B << 0);
 }

-STAGE(load_565, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(load_565, const SkJumper_MemoryCtx* ctx) {
     load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b);
     a = 255;
 }
-STAGE(load_565_dst, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(load_565_dst, const SkJumper_MemoryCtx* ctx) {
     load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db);
     da = 255;
 }
-STAGE(store_565, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(store_565, const SkJumper_MemoryCtx* ctx) {
     store_565(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b);
 }
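
The bodies of load_565 and store_565 are mostly hidden by the diff context here. For orientation, this is a generic scalar sketch of the 16-bit r5g6b5 round-trip; the bit-replication used when widening back to 8 bits is the common approach, not necessarily the exact scaling this file uses.

#include <cassert>
#include <cstdint>

static uint16_t pack_565(uint8_t r, uint8_t g, uint8_t b) {
    return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}
static void unpack_565(uint16_t p, uint8_t* r, uint8_t* g, uint8_t* b) {
    uint8_t R = (p >> 11) & 31, G = (p >> 5) & 63, B = p & 31;
    *r = (uint8_t)((R << 3) | (R >> 2));   // replicate top bits so 31 -> 255
    *g = (uint8_t)((G << 2) | (G >> 4));   //                       63 -> 255
    *b = (uint8_t)((B << 3) | (B >> 2));
}

int main() {
    uint8_t r,g,b;
    unpack_565(pack_565(255, 255, 255), &r, &g, &b);
    assert(r == 255 && g == 255 && b == 255);   // white survives the round-trip
}
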
@@ -434,41 +537,41 @@ SI void store_8(uint8_t* ptr, size_t tail, U16 v) {
     store(ptr, tail, cast<U8>(v));
 }

-STAGE(load_a8, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(load_a8, const SkJumper_MemoryCtx* ctx) {
     r = g = b = 0;
     a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
 }
-STAGE(load_a8_dst, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(load_a8_dst, const SkJumper_MemoryCtx* ctx) {
     dr = dg = db = 0;
     da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
 }
-STAGE(store_a8, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(store_a8, const SkJumper_MemoryCtx* ctx) {
     store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a);
 }

-STAGE(load_g8, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(load_g8, const SkJumper_MemoryCtx* ctx) {
     r = g = b = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
     a = 255;
 }
-STAGE(load_g8_dst, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(load_g8_dst, const SkJumper_MemoryCtx* ctx) {
     dr = dg = db = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
     da = 255;
 }

-STAGE(luminance_to_alpha, Ctx::None) {
+STAGE_PP(luminance_to_alpha, Ctx::None) {
     a = (r*54 + g*183 + b*19)/256;  // 0.2126, 0.7152, 0.0722 with 256 denominator.
     r = g = b = 0;
 }

 // ~~~~~~ Coverage scales / lerps ~~~~~~ //

-STAGE(scale_1_float, const float* f) {
+STAGE_PP(scale_1_float, const float* f) {
     U16 c = from_float(*f);
     r = div255( r * c );
     g = div255( g * c );
     b = div255( b * c );
     a = div255( a * c );
 }
-STAGE(lerp_1_float, const float* f) {
+STAGE_PP(lerp_1_float, const float* f) {
     U16 c = from_float(*f);
     r = lerp(dr, r, c);
     g = lerp(dg, g, c);
@@ -476,14 +579,14 @@ STAGE(lerp_1_float, const float* f) {
     a = lerp(da, a, c);
 }

-STAGE(scale_u8, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(scale_u8, const SkJumper_MemoryCtx* ctx) {
     U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
     r = div255( r * c );
     g = div255( g * c );
     b = div255( b * c );
     a = div255( a * c );
 }
-STAGE(lerp_u8, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(lerp_u8, const SkJumper_MemoryCtx* ctx) {
     U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
     r = lerp(dr, r, c);
     g = lerp(dg, g, c);
@@ -496,7 +599,7 @@ SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) {
     return if_then_else(a < da, min(cr,cg,cb)
                               , max(cr,cg,cb));
 }
-STAGE(scale_565, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(scale_565, const SkJumper_MemoryCtx* ctx) {
     U16 cr,cg,cb;
     load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
     U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
@@ -506,7 +609,7 @@ STAGE(scale_565, const SkJumper_MemoryCtx* ctx) {
     b = div255( b * cb );
     a = div255( a * ca );
 }
-STAGE(lerp_565, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(lerp_565, const SkJumper_MemoryCtx* ctx) {
     U16 cr,cg,cb;
     load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
     U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
@@ -519,7 +622,7 @@

 // ~~~~~~ Compound stages ~~~~~~ //

-STAGE(srcover_rgba_8888, const SkJumper_MemoryCtx* ctx) {
+STAGE_PP(srcover_rgba_8888, const SkJumper_MemoryCtx* ctx) {
     auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

     load_8888(ptr, tail, &dr,&dg,&db,&da);
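
The srcover_rgba_8888 stage fuses a dst load, a src-over blend, and a store into one stage, saving two trips through the pipeline ABI. For reference, src-over and the coverage lerp used by the lerp_* stages look like this in 8-bit fixed point. This is a standard scalar formulation with one common rounding variant of div255; the file's own div255 and lerp helpers are defined elsewhere and may round differently.

#include <cstdint>
#include <cstdio>

static uint8_t div255(uint32_t v) { return (uint8_t)((v + 127) / 255); }  // rounding division

// Src-over with premultiplied 8-bit color: d' = s + d*(255 - sa)/255.
static uint8_t srcover(uint8_t s, uint8_t d, uint8_t sa) {
    return (uint8_t)(s + div255(d * (255u - sa)));
}

// Coverage lerp: blend dst toward src by coverage c, per channel.
static uint8_t lerp255(uint8_t d, uint8_t s, uint8_t c) {
    return div255(s*c + d*(255u - c));
}

int main() {
    printf("%d %d\n", srcover(200, 100, 255),   // opaque src wins: 200
                      lerp255(0, 255, 128));    // half coverage: 128
}
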