From a46623b2f3bbf09822537c63a1d64f2e245c6ff0 Mon Sep 17 00:00:00 2001 From: Mike Klein Date: Sat, 10 Mar 2018 10:27:24 -0500 Subject: follow JUMPER_NARROW_STAGES in lowp stages too Should give dramatically better codegen for all 32-bit builds and 64-bit Windows builds, bringing it in line with how we make highp float stages. May help this bug, which is mostly Windows perf regressions. Bug: chromium:820469 Change-Id: I223f7568a09dea28ec614b18555766ea7d8365fa Reviewed-on: https://skia-review.googlesource.com/113665 Reviewed-by: Herb Derby Commit-Queue: Mike Klein --- src/opts/SkRasterPipeline_opts.h | 179 +++++++++++++++++++++++++++------------ 1 file changed, 124 insertions(+), 55 deletions(-) (limited to 'src/opts') diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h index 634d0fbccc..edc50732f3 100644 --- a/src/opts/SkRasterPipeline_opts.h +++ b/src/opts/SkRasterPipeline_opts.h @@ -704,7 +704,7 @@ static const size_t N = sizeof(F) / sizeof(float); #if defined(__ARM_NEON) && defined(__arm__) // This lets us pass vectors more efficiently on 32-bit ARM. #define ABI __attribute__((pcs("aapcs-vfp"))) -#elif defined(__clang__) && defined(_MSC_VER) +#elif 0 || defined(__clang__) && defined(_MSC_VER) // TODO: can we use sysv_abi here instead? It'd allow passing far more registers. #define ABI __attribute__((vectorcall)) #else @@ -718,7 +718,7 @@ static const size_t N = sizeof(F) / sizeof(float); // On ARMv7, we do the same so that we can make the r,g,b,a vectors wider. // // Finally, this narrower stage calling convention also fits Windows' __vectorcall very well. 
-#if defined(__i386__) || defined(_M_IX86) || defined(__arm__) || defined(_MSC_VER) +#if 0 || defined(__i386__) || defined(_M_IX86) || defined(__arm__) || defined(_MSC_VER) #define JUMPER_NARROW_STAGES 1 #else #define JUMPER_NARROW_STAGES 0 @@ -2329,17 +2329,36 @@ namespace lowp { static const size_t N = sizeof(U16) / sizeof(uint16_t); -// TODO: follow the guidance of JUMPER_NARROW_STAGES for lowp stages too. - -// We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64. -using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy, - U16 r, U16 g, U16 b, U16 a, - U16 dr, U16 dg, U16 db, U16 da); +// Once again, some platforms benefit from a restricted Stage calling convention, +// but others can pass tons and tons of registers and we're happy to exploit that. +// It's exactly the same decision and implementation strategy as the F stages above. +#if JUMPER_NARROW_STAGES + struct Params { + size_t dx, dy, tail; + U16 dr,dg,db,da; + }; + using Stage = void(ABI*)(Params*, void** program, U16 r, U16 g, U16 b, U16 a); +#else + // We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64. 
+ using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy, + U16 r, U16 g, U16 b, U16 a, + U16 dr, U16 dg, U16 db, U16 da); +#endif static void start_pipeline(const size_t x0, const size_t y0, const size_t xlimit, const size_t ylimit, void** program) { auto start = (Stage)load_and_inc(program); for (size_t dy = y0; dy < ylimit; dy++) { + #if JUMPER_NARROW_STAGES + Params params = { x0,dy,0, 0,0,0,0 }; + for (; params.dx + N <= xlimit; params.dx += N) { + start(&params,program, 0,0,0,0); + } + if (size_t tail = xlimit - params.dx) { + params.tail = tail; + start(&params,program, 0,0,0,0); + } + #else size_t dx = x0; for (; dx + N <= xlimit; dx += N) { start( 0,program,dx,dy, 0,0,0,0, 0,0,0,0); @@ -2347,10 +2366,15 @@ static void start_pipeline(const size_t x0, const size_t y0, if (size_t tail = xlimit - dx) { start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0); } + #endif } } -static ABI void just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U16,U16,U16) {} +#if JUMPER_NARROW_STAGES + static ABI void just_return(Params*, void**, U16,U16,U16,U16) {} +#else + static ABI void just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U16,U16,U16) {} +#endif // All stages use the same function call ABI to chain into each other, but there are three types: // GG: geometry in, geometry out -- think, a matrix // GP: geometry in, pixels out. A shader. // PP: pixels in, pixels out. A blend mode or a src/dst filter -- hey, what is a blend mode, anyway? // (Some stages ignore their inputs or produce no logical output. That's perfectly fine.) // // These three STAGE_ macros let you define each type of stage, // and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate. -#define STAGE_GG(name, ...) 
\ - SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \ - static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \ - U16 r, U16 g, U16 b, U16 a, \ - U16 dr, U16 dg, U16 db, U16 da) { \ - auto x = join(r,g), \ - y = join(b,a); \ - name##_k(Ctx{program}, dx,dy,tail, x,y); \ - split(x, &r,&g); \ - split(y, &b,&a); \ - auto next = (Stage)load_and_inc(program); \ - next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ - } \ - SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y) - -#define STAGE_GP(name, ...) \ - SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ - U16& r, U16& g, U16& b, U16& a, \ - U16& dr, U16& dg, U16& db, U16& da); \ - static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \ - U16 r, U16 g, U16 b, U16 a, \ - U16 dr, U16 dg, U16 db, U16 da) { \ - auto x = join(r,g), \ - y = join(b,a); \ - name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \ - auto next = (Stage)load_and_inc(program); \ - next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ - } \ - SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ - U16& r, U16& g, U16& b, U16& a, \ - U16& dr, U16& dg, U16& db, U16& da) - -#define STAGE_PP(name, ...) \ - SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ - U16& r, U16& g, U16& b, U16& a, \ - U16& dr, U16& dg, U16& db, U16& da); \ - static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \ - U16 r, U16 g, U16 b, U16 a, \ - U16 dr, U16 dg, U16 db, U16 da) { \ - name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \ - auto next = (Stage)load_and_inc(program); \ - next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ - } \ - SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ - U16& r, U16& g, U16& b, U16& a, \ - U16& dr, U16& dg, U16& db, U16& da) +#if JUMPER_NARROW_STAGES + #define STAGE_GG(name, ...) 
\ + SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \ + static ABI void name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \ + auto x = join(r,g), \ + y = join(b,a); \ + name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y); \ + split(x, &r,&g); \ + split(y, &b,&a); \ + auto next = (Stage)load_and_inc(program); \ + next(params,program, r,g,b,a); \ + } \ + SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y) + + #define STAGE_GP(name, ...) \ + SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ + U16& r, U16& g, U16& b, U16& a, \ + U16& dr, U16& dg, U16& db, U16& da); \ + static ABI void name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \ + auto x = join(r,g), \ + y = join(b,a); \ + name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y, r,g,b,a, \ + params->dr,params->dg,params->db,params->da); \ + auto next = (Stage)load_and_inc(program); \ + next(params,program, r,g,b,a); \ + } \ + SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ + U16& r, U16& g, U16& b, U16& a, \ + U16& dr, U16& dg, U16& db, U16& da) + + #define STAGE_PP(name, ...) \ + SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ + U16& r, U16& g, U16& b, U16& a, \ + U16& dr, U16& dg, U16& db, U16& da); \ + static ABI void name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \ + name##_k(Ctx{program}, params->dx,params->dy,params->tail, r,g,b,a, \ + params->dr,params->dg,params->db,params->da); \ + auto next = (Stage)load_and_inc(program); \ + next(params,program, r,g,b,a); \ + } \ + SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ + U16& r, U16& g, U16& b, U16& a, \ + U16& dr, U16& dg, U16& db, U16& da) +#else + #define STAGE_GG(name, ...) 
\ + SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \ + static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \ + U16 r, U16 g, U16 b, U16 a, \ + U16 dr, U16 dg, U16 db, U16 da) { \ + auto x = join(r,g), \ + y = join(b,a); \ + name##_k(Ctx{program}, dx,dy,tail, x,y); \ + split(x, &r,&g); \ + split(y, &b,&a); \ + auto next = (Stage)load_and_inc(program); \ + next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ + } \ + SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y) + + #define STAGE_GP(name, ...) \ + SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ + U16& r, U16& g, U16& b, U16& a, \ + U16& dr, U16& dg, U16& db, U16& da); \ + static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \ + U16 r, U16 g, U16 b, U16 a, \ + U16 dr, U16 dg, U16 db, U16 da) { \ + auto x = join(r,g), \ + y = join(b,a); \ + name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \ + auto next = (Stage)load_and_inc(program); \ + next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ + } \ + SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ + U16& r, U16& g, U16& b, U16& a, \ + U16& dr, U16& dg, U16& db, U16& da) + + #define STAGE_PP(name, ...) \ + SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ + U16& r, U16& g, U16& b, U16& a, \ + U16& dr, U16& dg, U16& db, U16& da); \ + static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \ + U16 r, U16 g, U16 b, U16 a, \ + U16 dr, U16 dg, U16 db, U16 da) { \ + name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \ + auto next = (Stage)load_and_inc(program); \ + next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ + } \ + SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ + U16& r, U16& g, U16& b, U16& a, \ + U16& dr, U16& dg, U16& db, U16& da) +#endif // ~~~~~~ Commonly used helper functions ~~~~~~ // -- cgit v1.2.3