author    Mike Klein <mtklein@chromium.org>  2018-03-10 10:27:24 -0500
committer Skia Commit-Bot <skia-commit-bot@chromium.org>  2018-03-10 16:07:18 +0000
commit a46623b2f3bbf09822537c63a1d64f2e245c6ff0
tree   206fb473fb0be6eea8eb53740f648f5203a1644e /src/opts
parent e3dc6a93466f73ccb5fba5364ec87df3aba4a1e2
follow JUMPER_NARROW_STAGES in lowp stages too
Should give dramatically better codegen for all 32-bit builds and 64-bit
Windows builds, bringing lowp stages in line with how we build the highp
float stages. May help this bug, which is mostly Windows perf regressions.

Bug: chromium:820469
Change-Id: I223f7568a09dea28ec614b18555766ea7d8365fa
Reviewed-on: https://skia-review.googlesource.com/113665
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
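For readers unfamiliar with the trick the subject line refers to: register-poor calling conventions (32-bit x86 and ARM, and Windows' __vectorcall, which passes at most six vector arguments in registers) force a twelve-argument stage signature to spill to the stack on every call. The narrow convention boxes the rarely-touched loop state into a struct passed by one pointer, leaving only the hot color values as register arguments. Below is a minimal, compilable sketch of that pattern; every name in it (Params, brighten, print_px) is illustrative, not Skia's, and plain ints stand in for Skia's vector types.

    // toy_stages.cpp -- illustrative sketch only, not Skia code.
    #include <cstddef>
    #include <cstdio>

    struct Params { size_t dx, dy, tail; };  // cold per-call state, one pointer

    using Stage = void (*)(Params*, void** program, int r, int g, int b, int a);

    // Pop the next stage off the program and advance, like Skia's load_and_inc().
    static Stage load_and_inc(void**& program) {
        return (Stage)*program++;
    }

    static void brighten(Params* p, void** program, int r, int g, int b, int a) {
        r += 16; g += 16; b += 16;                 // this stage's work
        auto next = load_and_inc(program);
        next(p, program, r, g, b, a);              // chain into the next stage
    }

    static void print_px(Params* p, void** program, int r, int g, int b, int a) {
        std::printf("dx=%zu rgba=(%d,%d,%d,%d)\n", p->dx, r, g, b, a);
        auto next = load_and_inc(program);
        next(p, program, r, g, b, a);
    }

    static void just_return(Params*, void**, int, int, int, int) {}

    int main() {
        void* program[] = {(void*)brighten, (void*)print_px, (void*)just_return};
        Params params = {0, 0, 0};
        void** ip = program;
        auto start = load_and_inc(ip);
        start(&params, ip, 10, 20, 30, 255);
    }

Each stage tail-calls the next by popping its function pointer off program, so the whole pipeline runs as a chain of calls sharing one signature, terminated by the do-nothing just_return sentinel.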
Diffstat (limited to 'src/opts')
-rw-r--r--  src/opts/SkRasterPipeline_opts.h | 179
1 file changed, 124 insertions(+), 55 deletions(-)
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 634d0fbccc..edc50732f3 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -704,7 +704,7 @@ static const size_t N = sizeof(F) / sizeof(float);
#if defined(__ARM_NEON) && defined(__arm__)
// This lets us pass vectors more efficiently on 32-bit ARM.
#define ABI __attribute__((pcs("aapcs-vfp")))
-#elif defined(__clang__) && defined(_MSC_VER)
+#elif 0 || defined(__clang__) && defined(_MSC_VER)
// TODO: can we use sysv_abi here instead? It'd allow passing far more registers.
#define ABI __attribute__((vectorcall))
#else
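For context on the __vectorcall branch above: Microsoft's __vectorcall passes at most six vector arguments in XMM/YMM registers, so the wide lowp signature further down (twelve vectors plus scalars) spills badly on Win64, which is what this patch avoids. A hedged illustration, assuming clang targeting MSVC; the madd function is made up:

    // With three 4-float vector arguments, a, b, and c all arrive in XMM
    // registers under __vectorcall; arguments past its six-register budget
    // would be passed through memory instead.
    typedef float F4 __attribute__((ext_vector_type(4)));
    static __attribute__((vectorcall)) F4 madd(F4 a, F4 b, F4 c) {
        return a * b + c;
    }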
@@ -718,7 +718,7 @@ static const size_t N = sizeof(F) / sizeof(float);
// On ARMv7, we do the same so that we can make the r,g,b,a vectors wider.
//
// Finally, this narrower stage calling convention also fits Windows' __vectorcall very well.
-#if defined(__i386__) || defined(_M_IX86) || defined(__arm__) || defined(_MSC_VER)
+#if 0 || defined(__i386__) || defined(_M_IX86) || defined(__arm__) || defined(_MSC_VER)
#define JUMPER_NARROW_STAGES 1
#else
#define JUMPER_NARROW_STAGES 0
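Note the `0 ||` prepended to both conditions in this patch: it is a no-op as written, but it reads as the common toggle idiom that lets a developer flip the 0 to 1 to force the narrow path (or the __vectorcall ABI) on every platform while testing, without disturbing the platform checks. Forcing narrow stages everywhere would then read:

    #if 1 || defined(__i386__) || defined(_M_IX86) || defined(__arm__) || defined(_MSC_VER)
        #define JUMPER_NARROW_STAGES 1
    #else
        #define JUMPER_NARROW_STAGES 0
    #endif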
@@ -2329,17 +2329,36 @@ namespace lowp {
static const size_t N = sizeof(U16) / sizeof(uint16_t);
-// TODO: follow the guidance of JUMPER_NARROW_STAGES for lowp stages too.
-
-// We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64.
-using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
- U16 r, U16 g, U16 b, U16 a,
- U16 dr, U16 dg, U16 db, U16 da);
+// Once again, some platforms benefit from a restricted Stage calling convention,
+// but others can pass tons and tons of registers and we're happy to exploit that.
+// It's exactly the same decision and implementation strategy as the F stages above.
+#if JUMPER_NARROW_STAGES
+ struct Params {
+ size_t dx, dy, tail;
+ U16 dr,dg,db,da;
+ };
+ using Stage = void(ABI*)(Params*, void** program, U16 r, U16 g, U16 b, U16 a);
+#else
+ // We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64.
+ using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
+ U16 r, U16 g, U16 b, U16 a,
+ U16 dr, U16 dg, U16 db, U16 da);
+#endif
static void start_pipeline(const size_t x0, const size_t y0,
const size_t xlimit, const size_t ylimit, void** program) {
auto start = (Stage)load_and_inc(program);
for (size_t dy = y0; dy < ylimit; dy++) {
+ #if JUMPER_NARROW_STAGES
+ Params params = { x0,dy,0, 0,0,0,0 };
+ for (; params.dx + N <= xlimit; params.dx += N) {
+ start(&params,program, 0,0,0,0);
+ }
+ if (size_t tail = xlimit - params.dx) {
+ params.tail = tail;
+ start(&params,program, 0,0,0,0);
+ }
+ #else
size_t dx = x0;
for (; dx + N <= xlimit; dx += N) {
start( 0,program,dx,dy, 0,0,0,0, 0,0,0,0);
@@ -2347,10 +2366,15 @@ static void start_pipeline(const size_t x0, const size_t y0,
if (size_t tail = xlimit - dx) {
start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
}
+ #endif
}
}
-static ABI void just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U16,U16,U16) {}
+#if JUMPER_NARROW_STAGES
+ static ABI void just_return(Params*, void**, U16,U16,U16,U16) {}
+#else
+ static ABI void just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U16,U16,U16) {}
+#endif
// All stages use the same function call ABI to chain into each other, but there are three types:
// GG: geometry in, geometry out -- think, a matrix
@@ -2362,52 +2386,97 @@ static ABI void just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U1
// These three STAGE_ macros let you define each type of stage,
// and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate.
-#define STAGE_GG(name, ...) \
- SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
- static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \
- U16 r, U16 g, U16 b, U16 a, \
- U16 dr, U16 dg, U16 db, U16 da) { \
- auto x = join<F>(r,g), \
- y = join<F>(b,a); \
- name##_k(Ctx{program}, dx,dy,tail, x,y); \
- split(x, &r,&g); \
- split(y, &b,&a); \
- auto next = (Stage)load_and_inc(program); \
- next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
- } \
- SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)
-
-#define STAGE_GP(name, ...) \
- SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
- U16& r, U16& g, U16& b, U16& a, \
- U16& dr, U16& dg, U16& db, U16& da); \
- static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \
- U16 r, U16 g, U16 b, U16 a, \
- U16 dr, U16 dg, U16 db, U16 da) { \
- auto x = join<F>(r,g), \
- y = join<F>(b,a); \
- name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \
- auto next = (Stage)load_and_inc(program); \
- next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
- } \
- SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
- U16& r, U16& g, U16& b, U16& a, \
- U16& dr, U16& dg, U16& db, U16& da)
-
-#define STAGE_PP(name, ...) \
- SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
- U16& r, U16& g, U16& b, U16& a, \
- U16& dr, U16& dg, U16& db, U16& da); \
- static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \
- U16 r, U16 g, U16 b, U16 a, \
- U16 dr, U16 dg, U16 db, U16 da) { \
- name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \
- auto next = (Stage)load_and_inc(program); \
- next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
- } \
- SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
- U16& r, U16& g, U16& b, U16& a, \
- U16& dr, U16& dg, U16& db, U16& da)
+#if JUMPER_NARROW_STAGES
+ #define STAGE_GG(name, ...) \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
+ static ABI void name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
+ auto x = join<F>(r,g), \
+ y = join<F>(b,a); \
+ name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y); \
+ split(x, &r,&g); \
+ split(y, &b,&a); \
+ auto next = (Stage)load_and_inc(program); \
+ next(params,program, r,g,b,a); \
+ } \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)
+
+ #define STAGE_GP(name, ...) \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
+ U16& r, U16& g, U16& b, U16& a, \
+ U16& dr, U16& dg, U16& db, U16& da); \
+ static ABI void name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
+ auto x = join<F>(r,g), \
+ y = join<F>(b,a); \
+ name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y, r,g,b,a, \
+ params->dr,params->dg,params->db,params->da); \
+ auto next = (Stage)load_and_inc(program); \
+ next(params,program, r,g,b,a); \
+ } \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
+ U16& r, U16& g, U16& b, U16& a, \
+ U16& dr, U16& dg, U16& db, U16& da)
+
+ #define STAGE_PP(name, ...) \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
+ U16& r, U16& g, U16& b, U16& a, \
+ U16& dr, U16& dg, U16& db, U16& da); \
+ static ABI void name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
+ name##_k(Ctx{program}, params->dx,params->dy,params->tail, r,g,b,a, \
+ params->dr,params->dg,params->db,params->da); \
+ auto next = (Stage)load_and_inc(program); \
+ next(params,program, r,g,b,a); \
+ } \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
+ U16& r, U16& g, U16& b, U16& a, \
+ U16& dr, U16& dg, U16& db, U16& da)
+#else
+ #define STAGE_GG(name, ...) \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
+ static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \
+ U16 r, U16 g, U16 b, U16 a, \
+ U16 dr, U16 dg, U16 db, U16 da) { \
+ auto x = join<F>(r,g), \
+ y = join<F>(b,a); \
+ name##_k(Ctx{program}, dx,dy,tail, x,y); \
+ split(x, &r,&g); \
+ split(y, &b,&a); \
+ auto next = (Stage)load_and_inc(program); \
+ next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
+ } \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)
+
+ #define STAGE_GP(name, ...) \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
+ U16& r, U16& g, U16& b, U16& a, \
+ U16& dr, U16& dg, U16& db, U16& da); \
+ static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \
+ U16 r, U16 g, U16 b, U16 a, \
+ U16 dr, U16 dg, U16 db, U16 da) { \
+ auto x = join<F>(r,g), \
+ y = join<F>(b,a); \
+ name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \
+ auto next = (Stage)load_and_inc(program); \
+ next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
+ } \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
+ U16& r, U16& g, U16& b, U16& a, \
+ U16& dr, U16& dg, U16& db, U16& da)
+
+ #define STAGE_PP(name, ...) \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
+ U16& r, U16& g, U16& b, U16& a, \
+ U16& dr, U16& dg, U16& db, U16& da); \
+ static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \
+ U16 r, U16 g, U16 b, U16 a, \
+ U16 dr, U16 dg, U16 db, U16 da) { \
+ name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \
+ auto next = (Stage)load_and_inc(program); \
+ next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
+ } \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
+ U16& r, U16& g, U16& b, U16& a, \
+ U16& dr, U16& dg, U16& db, U16& da)
+#endif
// ~~~~~~ Commonly used helper functions ~~~~~~ //
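As an aside on the start_pipeline loops in this patch (both the narrow and wide variants): they use a standard strip-mining pattern, running the pipeline on N pixels at a time while a full vector's worth remains, then issuing one last call with tail set to the remainder so stages can mask partial loads and stores. A generic sketch of that pattern; strip_mine and process are illustrative names, not Skia's:

    #include <cstddef>

    // Walk a row [x0, xlimit) in vector-width chunks. `tail` is zero for
    // full chunks and the remainder count (1..N-1) for the final partial one.
    template <size_t N, typename Fn>
    void strip_mine(size_t x0, size_t xlimit, Fn process) {
        size_t dx = x0;
        for (; dx + N <= xlimit; dx += N) {
            process(dx, /*tail=*/0);   // full N-pixel chunk
        }
        if (size_t tail = xlimit - dx) {
            process(dx, tail);         // partial chunk: stages mask by tail
        }
    }

With N = 8 and a 21-pixel row starting at x0 = 0, process runs at dx = 0 and dx = 8 with tail 0, then once more at dx = 16 with tail 5.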