diff options
Diffstat (limited to 'src/jumper/SkJumper_stages.cpp')
-rw-r--r-- | src/jumper/SkJumper_stages.cpp | 132 |
1 files changed, 66 insertions, 66 deletions
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp index d192728419..1f9848e1fa 100644 --- a/src/jumper/SkJumper_stages.cpp +++ b/src/jumper/SkJumper_stages.cpp @@ -47,40 +47,40 @@ static const size_t N = sizeof(F) / sizeof(float); // General-purpose registers are also tight, so we put most of those on the stack too. // On ARMv7, we do the same so that we can make the r,g,b,a vectors wider. struct Params { - size_t x, y, tail; + size_t dx, dy, tail; F dr,dg,db,da; }; using Stage = void(Params*, void** program, F r, F g, F b, F a); #else // We keep program the second argument, so that it's passed in rsi for load_and_inc(). - using Stage = void(size_t tail, void** program, size_t x, size_t y, F,F,F,F, F,F,F,F); + using Stage = void(size_t tail, void** program, size_t dx, size_t dy, F,F,F,F, F,F,F,F); #endif MAYBE_MSABI -extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t ylimit, +extern "C" void WRAP(start_pipeline)(size_t dx, size_t dy, size_t xlimit, size_t ylimit, void** program) { auto start = (Stage*)load_and_inc(program); - const size_t x0 = x; - for (; y < ylimit; y++) { + const size_t x0 = dx; + for (; dy < ylimit; dy++) { #if defined(__i386__) || defined(_M_IX86) || defined(__arm__) - Params params = { x0,y,0, 0,0,0,0 }; - while (params.x + N <= xlimit) { + Params params = { x0,dy,0, 0,0,0,0 }; + while (params.dx + N <= xlimit) { start(¶ms,program, 0,0,0,0); - params.x += N; + params.dx += N; } - if (size_t tail = xlimit - params.x) { + if (size_t tail = xlimit - params.dx) { params.tail = tail; start(¶ms,program, 0,0,0,0); } #else - x = x0; - while (x + N <= xlimit) { - start(0,program,x,y, 0,0,0,0, 0,0,0,0); - x += N; + dx = x0; + while (dx + N <= xlimit) { + start(0,program,dx,dy, 0,0,0,0, 0,0,0,0); + dx += N; } - if (size_t tail = xlimit - x) { - start(tail,program,x,y, 0,0,0,0, 0,0,0,0); + if (size_t tail = xlimit - dx) { + start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0); } #endif } @@ -88,28 +88,28 @@ extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t y #if defined(__i386__) || defined(_M_IX86) || defined(__arm__) #define STAGE(name, ...) \ - SI void name##_k(__VA_ARGS__, size_t x, size_t y, size_t tail, \ + SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \ extern "C" void WRAP(name)(Params* params, void** program, \ F r, F g, F b, F a) { \ - name##_k(Ctx{program},params->x,params->y,params->tail, r,g,b,a, \ + name##_k(Ctx{program},params->dx,params->dy,params->tail, r,g,b,a, \ params->dr, params->dg, params->db, params->da); \ auto next = (Stage*)load_and_inc(program); \ next(params,program, r,g,b,a); \ } \ - SI void name##_k(__VA_ARGS__, size_t x, size_t y, size_t tail, \ + SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da) #else #define STAGE(name, ...) \ - SI void name##_k(__VA_ARGS__, size_t x, size_t y, size_t tail, \ + SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \ - extern "C" void WRAP(name)(size_t tail, void** program, size_t x, size_t y, \ + extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \ F r, F g, F b, F a, F dr, F dg, F db, F da) { \ - name##_k(Ctx{program},x,y,tail, r,g,b,a, dr,dg,db,da); \ + name##_k(Ctx{program},dx,dy,tail, r,g,b,a, dr,dg,db,da); \ auto next = (Stage*)load_and_inc(program); \ - next(tail,program,x,y, r,g,b,a, dr,dg,db,da); \ + next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ } \ - SI void name##_k(__VA_ARGS__, size_t x, size_t y, size_t tail, \ + SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da) #endif @@ -192,10 +192,10 @@ SI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) { *a = cast((_8888 >> 24) ) * (1/255.0f); } -// Used by load_ and store_ stages to get to the right (x,y) starting point of contiguous memory. +// Used by load_ and store_ stages to get to the right (dx,dy) starting point of contiguous memory. template <typename T> -SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, int x, int y) { - return (T*)ctx->pixels + y*ctx->stride + x; +SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, int dx, int dy) { + return (T*)ctx->pixels + dy*ctx->stride + dx; } // Used by gather_ stages to calculate the base pointer and a vector of indices to load. @@ -215,22 +215,22 @@ SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) { // Now finally, normal Stages! STAGE(seed_shader, Ctx::None) { - // It's important for speed to explicitly cast(x) and cast(y), + // It's important for speed to explicitly cast(dx) and cast(dy), // which has the effect of splatting them to vectors before converting to floats. // On Intel this breaks a data dependency on previous loop iterations' registers. float iota[] = { 0.5f,1.5f,2.5f,3.5f,4.5f,5.5f,6.5f,7.5f }; - r = cast(x) + unaligned_load<F>(iota); - g = cast(y) + 0.5f; + r = cast(dx) + unaligned_load<F>(iota); + g = cast(dy) + 0.5f; b = 1.0f; a = 0; dr = dg = db = da = 0; } STAGE(dither, const float* rate) { - // Get [(x,y), (x+1,y), (x+2,y), ...] loaded up in integer vectors. + // Get [(dx,dy), (dx+1,dy), (dx+2,dy), ...] loaded up in integer vectors. uint32_t iota[] = {0,1,2,3,4,5,6,7}; - U32 X = x + unaligned_load<U32>(iota), - Y = y; + U32 X = dx + unaligned_load<U32>(iota), + Y = dy; // We're doing 8x8 ordered dithering, see https://en.wikipedia.org/wiki/Ordered_dithering. // In this case n=8 and we're using the matrix that looks like 1/64 x [ 0 48 12 60 ... ]. @@ -480,7 +480,7 @@ STAGE(luminosity, Ctx::None) { } STAGE(srcover_rgba_8888, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<uint32_t>(ctx, x,y); + auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); U32 dst = load<U32>(ptr, tail); dr = cast((dst ) & 0xff); @@ -679,7 +679,7 @@ STAGE(scale_1_float, const float* c) { a = a * *c; } STAGE(scale_u8, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y); + auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); auto scales = load<U8>(ptr, tail); auto c = from_byte(scales); @@ -690,7 +690,7 @@ STAGE(scale_u8, const SkJumper_MemoryCtx* ctx) { a = a * c; } STAGE(scale_565, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const uint16_t>(ctx, x,y); + auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); F cr,cg,cb; from_565(load<U16>(ptr, tail), &cr, &cg, &cb); @@ -714,7 +714,7 @@ STAGE(lerp_1_float, const float* c) { a = lerp(da, a, *c); } STAGE(lerp_u8, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y); + auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); auto scales = load<U8>(ptr, tail); auto c = from_byte(scales); @@ -725,7 +725,7 @@ STAGE(lerp_u8, const SkJumper_MemoryCtx* ctx) { a = lerp(da, a, c); } STAGE(lerp_565, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const uint16_t>(ctx, x,y); + auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); F cr,cg,cb; from_565(load<U16>(ptr, tail), &cr, &cg, &cb); @@ -739,14 +739,14 @@ STAGE(lerp_565, const SkJumper_MemoryCtx* ctx) { } STAGE(load_tables, const SkJumper_LoadTablesCtx* c) { - auto px = load<U32>((const uint32_t*)c->src + x, tail); + auto px = load<U32>((const uint32_t*)c->src + dx, tail); r = gather(c->r, (px ) & 0xff); g = gather(c->g, (px >> 8) & 0xff); b = gather(c->b, (px >> 16) & 0xff); a = cast( (px >> 24)) * (1/255.0f); } STAGE(load_tables_u16_be, const SkJumper_LoadTablesCtx* c) { - auto ptr = (const uint16_t*)c->src + 4*x; + auto ptr = (const uint16_t*)c->src + 4*dx; U16 R,G,B,A; load4(ptr, tail, &R,&G,&B,&A); @@ -758,7 +758,7 @@ STAGE(load_tables_u16_be, const SkJumper_LoadTablesCtx* c) { a = (1/65535.0f) * cast(expand(bswap(A))); } STAGE(load_tables_rgb_u16_be, const SkJumper_LoadTablesCtx* c) { - auto ptr = (const uint16_t*)c->src + 3*x; + auto ptr = (const uint16_t*)c->src + 3*dx; U16 R,G,B; load3(ptr, tail, &R,&G,&B); @@ -834,13 +834,13 @@ STAGE(lab_to_xyz, Ctx::None) { } STAGE(load_a8, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y); + auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); r = g = b = 0.0f; a = from_byte(load<U8>(ptr, tail)); } STAGE(load_a8_dst, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y); + auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); dr = dg = db = 0.0f; da = from_byte(load<U8>(ptr, tail)); @@ -852,20 +852,20 @@ STAGE(gather_a8, const SkJumper_GatherCtx* ctx) { a = from_byte(gather(ptr, ix)); } STAGE(store_a8, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<uint8_t>(ctx, x,y); + auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy); U8 packed = pack(pack(round(a, 255.0f))); store(ptr, packed, tail); } STAGE(load_g8, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y); + auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); r = g = b = from_byte(load<U8>(ptr, tail)); a = 1.0f; } STAGE(load_g8_dst, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y); + auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); dr = dg = db = from_byte(load<U8>(ptr, tail)); da = 1.0f; @@ -878,13 +878,13 @@ STAGE(gather_g8, const SkJumper_GatherCtx* ctx) { } STAGE(load_565, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const uint16_t>(ctx, x,y); + auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); from_565(load<U16>(ptr, tail), &r,&g,&b); a = 1.0f; } STAGE(load_565_dst, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const uint16_t>(ctx, x,y); + auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); from_565(load<U16>(ptr, tail), &dr,&dg,&db); da = 1.0f; @@ -896,7 +896,7 @@ STAGE(gather_565, const SkJumper_GatherCtx* ctx) { a = 1.0f; } STAGE(store_565, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<uint16_t>(ctx, x,y); + auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); U16 px = pack( round(r, 31.0f) << 11 | round(g, 63.0f) << 5 @@ -905,11 +905,11 @@ STAGE(store_565, const SkJumper_MemoryCtx* ctx) { } STAGE(load_4444, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const uint16_t>(ctx, x,y); + auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); from_4444(load<U16>(ptr, tail), &r,&g,&b,&a); } STAGE(load_4444_dst, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const uint16_t>(ctx, x,y); + auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); from_4444(load<U16>(ptr, tail), &dr,&dg,&db,&da); } STAGE(gather_4444, const SkJumper_GatherCtx* ctx) { @@ -918,7 +918,7 @@ STAGE(gather_4444, const SkJumper_GatherCtx* ctx) { from_4444(gather(ptr, ix), &r,&g,&b,&a); } STAGE(store_4444, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<uint16_t>(ctx, x,y); + auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); U16 px = pack( round(r, 15.0f) << 12 | round(g, 15.0f) << 8 | round(b, 15.0f) << 4 @@ -927,11 +927,11 @@ STAGE(store_4444, const SkJumper_MemoryCtx* ctx) { } STAGE(load_8888, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const uint32_t>(ctx, x,y); + auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); from_8888(load<U32>(ptr, tail), &r,&g,&b,&a); } STAGE(load_8888_dst, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const uint32_t>(ctx, x,y); + auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); from_8888(load<U32>(ptr, tail), &dr,&dg,&db,&da); } STAGE(gather_8888, const SkJumper_GatherCtx* ctx) { @@ -940,7 +940,7 @@ STAGE(gather_8888, const SkJumper_GatherCtx* ctx) { from_8888(gather(ptr, ix), &r,&g,&b,&a); } STAGE(store_8888, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<uint32_t>(ctx, x,y); + auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); U32 px = round(r, 255.0f) | round(g, 255.0f) << 8 @@ -950,11 +950,11 @@ STAGE(store_8888, const SkJumper_MemoryCtx* ctx) { } STAGE(load_bgra, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const uint32_t>(ctx, x,y); + auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); from_8888(load<U32>(ptr, tail), &b,&g,&r,&a); } STAGE(load_bgra_dst, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const uint32_t>(ctx, x,y); + auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); from_8888(load<U32>(ptr, tail), &db,&dg,&dr,&da); } STAGE(gather_bgra, const SkJumper_GatherCtx* ctx) { @@ -963,7 +963,7 @@ STAGE(gather_bgra, const SkJumper_GatherCtx* ctx) { from_8888(gather(ptr, ix), &b,&g,&r,&a); } STAGE(store_bgra, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<uint32_t>(ctx, x,y); + auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); U32 px = round(b, 255.0f) | round(g, 255.0f) << 8 @@ -973,7 +973,7 @@ STAGE(store_bgra, const SkJumper_MemoryCtx* ctx) { } STAGE(load_f16, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const uint64_t>(ctx, x,y); + auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy); U16 R,G,B,A; load4((const uint16_t*)ptr,tail, &R,&G,&B,&A); @@ -983,7 +983,7 @@ STAGE(load_f16, const SkJumper_MemoryCtx* ctx) { a = from_half(A); } STAGE(load_f16_dst, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const uint64_t>(ctx, x,y); + auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy); U16 R,G,B,A; load4((const uint16_t*)ptr,tail, &R,&G,&B,&A); @@ -1005,7 +1005,7 @@ STAGE(gather_f16, const SkJumper_GatherCtx* ctx) { a = from_half(A); } STAGE(store_f16, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<uint64_t>(ctx, x,y); + auto ptr = ptr_at_xy<uint64_t>(ctx, dx,dy); store4((uint16_t*)ptr,tail, to_half(r) , to_half(g) , to_half(b) @@ -1013,7 +1013,7 @@ STAGE(store_f16, const SkJumper_MemoryCtx* ctx) { } STAGE(load_u16_be, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const uint16_t>(ctx, 4*x,y); + auto ptr = ptr_at_xy<const uint16_t>(ctx, 4*dx,dy); U16 R,G,B,A; load4(ptr,tail, &R,&G,&B,&A); @@ -1024,7 +1024,7 @@ STAGE(load_u16_be, const SkJumper_MemoryCtx* ctx) { a = (1/65535.0f) * cast(expand(bswap(A))); } STAGE(load_rgb_u16_be, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const uint16_t>(ctx, 3*x,y); + auto ptr = ptr_at_xy<const uint16_t>(ctx, 3*dx,dy); U16 R,G,B; load3(ptr,tail, &R,&G,&B); @@ -1035,7 +1035,7 @@ STAGE(load_rgb_u16_be, const SkJumper_MemoryCtx* ctx) { a = 1.0f; } STAGE(store_u16_be, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<uint16_t>(ctx, 4*x,y); + auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,dy); U16 R = bswap(pack(round(r, 65535.0f))), G = bswap(pack(round(g, 65535.0f))), @@ -1046,15 +1046,15 @@ STAGE(store_u16_be, const SkJumper_MemoryCtx* ctx) { } STAGE(load_f32, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const float>(ctx, 4*x,y); + auto ptr = ptr_at_xy<const float>(ctx, 4*dx,dy); load4(ptr,tail, &r,&g,&b,&a); } STAGE(load_f32_dst, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<const float>(ctx, 4*x,y); + auto ptr = ptr_at_xy<const float>(ctx, 4*dx,dy); load4(ptr,tail, &dr,&dg,&db,&da); } STAGE(store_f32, const SkJumper_MemoryCtx* ctx) { - auto ptr = ptr_at_xy<float>(ctx, 4*x,y); + auto ptr = ptr_at_xy<float>(ctx, 4*dx,dy); store4(ptr,tail, r,g,b,a); } |