author    Mike Klein <mtklein@chromium.org>  2018-03-10 10:27:24 -0500
committer Skia Commit-Bot <skia-commit-bot@chromium.org>  2018-03-10 16:07:18 +0000
commit a46623b2f3bbf09822537c63a1d64f2e245c6ff0
tree   206fb473fb0be6eea8eb53740f648f5203a1644e /src/opts
parent e3dc6a93466f73ccb5fba5364ec87df3aba4a1e2
follow JUMPER_NARROW_STAGES in lowp stages too
Should give dramatically better codegen for all 32-bit builds and 64-bit
Windows builds, bringing lowp stages in line with how we build the highp
float stages. May help this bug, which is mostly Windows perf regressions.

Bug: chromium:820469
Change-Id: I223f7568a09dea28ec614b18555766ea7d8365fa
Reviewed-on: https://skia-review.googlesource.com/113665
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
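For readers unfamiliar with the trick the subject line refers to: register-poor calling conventions (32-bit x86 and ARM, and Windows' __vectorcall, which passes at most six vector arguments in registers) force a twelve-argument stage signature to spill to the stack on every call. The narrow convention boxes the rarely-touched loop state into a struct passed by one pointer, leaving only the hot color values as register arguments. Below is a minimal, compilable sketch of that pattern; every name in it (Params, brighten, print_px) is illustrative, not Skia's, and plain ints stand in for Skia's vector types.

    // toy_stages.cpp -- illustrative sketch only, not Skia code.
    #include <cstddef>
    #include <cstdio>

    struct Params { size_t dx, dy, tail; };  // cold per-call state, one pointer

    using Stage = void (*)(Params*, void** program, int r, int g, int b, int a);

    // Pop the next stage off the program and advance, like Skia's load_and_inc().
    static Stage load_and_inc(void**& program) {
        return (Stage)*program++;
    }

    static void brighten(Params* p, void** program, int r, int g, int b, int a) {
        r += 16; g += 16; b += 16;                 // this stage's work
        auto next = load_and_inc(program);
        next(p, program, r, g, b, a);              // chain into the next stage
    }

    static void print_px(Params* p, void** program, int r, int g, int b, int a) {
        std::printf("dx=%zu rgba=(%d,%d,%d,%d)\n", p->dx, r, g, b, a);
        auto next = load_and_inc(program);
        next(p, program, r, g, b, a);
    }

    static void just_return(Params*, void**, int, int, int, int) {}

    int main() {
        void* program[] = {(void*)brighten, (void*)print_px, (void*)just_return};
        Params params = {0, 0, 0};
        void** ip = program;
        auto start = load_and_inc(ip);
        start(&params, ip, 10, 20, 30, 255);
    }

Each stage tail-calls the next by popping its function pointer off program, so the whole pipeline runs as a chain of calls sharing one signature, terminated by the do-nothing just_return sentinel.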
Diffstat (limited to 'src/opts')
-rw-r--r--  src/opts/SkRasterPipeline_opts.h | 179
1 file changed, 124 insertions(+), 55 deletions(-)
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 634d0fbccc..edc50732f3 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -704,7 +704,7 @@ static const size_t N = sizeof(F) / sizeof(float);
#if defined(__ARM_NEON) && defined(__arm__)
// This lets us pass vectors more efficiently on 32-bit ARM.
#define ABI __attribute__((pcs("aapcs-vfp")))
-#elif defined(__clang__) && defined(_MSC_VER)
+#elif 0 || defined(__clang__) && defined(_MSC_VER)
// TODO: can we use sysv_abi here instead? It'd allow passing far more registers.
#define ABI __attribute__((vectorcall))
#else
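For context on the __vectorcall branch above: Microsoft's __vectorcall passes at most six vector arguments in XMM/YMM registers, so the wide lowp signature further down (twelve vectors plus scalars) spills badly on Win64, which is what this patch avoids. A hedged illustration, assuming clang targeting MSVC; the madd function is made up:

    // With three 4-float vector arguments, a, b, and c all arrive in XMM
    // registers under __vectorcall; arguments past its six-register budget
    // would be passed through memory instead.
    typedef float F4 __attribute__((ext_vector_type(4)));
    static __attribute__((vectorcall)) F4 madd(F4 a, F4 b, F4 c) {
        return a * b + c;
    }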
@@ -718,7 +718,7 @@ static const size_t N = sizeof(F) / sizeof(float);
// On ARMv7, we do the same so that we can make the r,g,b,a vectors wider.
//
// Finally, this narrower stage calling convention also fits Windows' __vectorcall very well.
-#if defined(__i386__) || defined(_M_IX86) || defined(__arm__) || defined(_MSC_VER)
+#if 0 || defined(__i386__) || defined(_M_IX86) || defined(__arm__) || defined(_MSC_VER)
#define JUMPER_NARROW_STAGES 1
#else
#define JUMPER_NARROW_STAGES 0
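Note the `0 ||` prepended to both conditions in this patch: it is a no-op as written, but it reads as the common toggle idiom that lets a developer flip the 0 to 1 to force the narrow path (or the __vectorcall ABI) on every platform while testing, without disturbing the platform checks. Forcing narrow stages everywhere would then read:

    #if 1 || defined(__i386__) || defined(_M_IX86) || defined(__arm__) || defined(_MSC_VER)
        #define JUMPER_NARROW_STAGES 1
    #else
        #define JUMPER_NARROW_STAGES 0
    #endif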
@@ -2329,17 +2329,36 @@ namespace lowp {
static const size_t N = sizeof(U16) / sizeof(uint16_t);
-// TODO: follow the guidance of JUMPER_NARROW_STAGES for lowp stages too.
-
-// We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64.
-using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
- U16 r, U16 g, U16 b, U16 a,
- U16 dr, U16 dg, U16 db, U16 da);
+// Once again, some platforms benefit from a restricted Stage calling convention,
+// but others can pass tons and tons of registers and we're happy to exploit that.
+// It's exactly the same decision and implementation strategy as the F stages above.
+#if JUMPER_NARROW_STAGES
+ struct Params {
+ size_t dx, dy, tail;
+ U16 dr,dg,db,da;
+ };
+ using Stage = void(ABI*)(Params*, void** program, U16 r, U16 g, U16 b, U16 a);
+#else
+ // We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64.
+ using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
+ U16 r, U16 g, U16 b, U16 a,
+ U16 dr, U16 dg, U16 db, U16 da);
+#endif
static void start_pipeline(const size_t x0, const size_t y0,
const size_t xlimit, const size_t ylimit, void** program) {
auto start = (Stage)load_and_inc(program);
for (size_t dy = y0; dy < ylimit; dy++) {
+ #if JUMPER_NARROW_STAGES
+ Params params = { x0,dy,0, 0,0,0,0 };
+ for (; params.dx + N <= xlimit; params.dx += N) {
+ start(&params,program, 0,0,0,0);
+ }
+ if (size_t tail = xlimit - params.dx) {
+ params.tail = tail;
+ start(&params,program, 0,0,0,0);
+ }
+ #else
size_t dx = x0;
for (; dx + N <= xlimit; dx += N) {
start( 0,program,dx,dy, 0,0,0,0, 0,0,0,0);
@@ -2347,10 +2366,15 @@ static void start_pipeline(const size_t x0, const size_t y0,
if (size_t tail = xlimit - dx) {
start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
}
+ #endif
}
}
-static ABI void just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U16,U16,U16) {}
+#if JUMPER_NARROW_STAGES
+ static ABI void just_return(Params*, void**, U16,U16,U16,U16) {}
+#else
+ static ABI void just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U16,U16,U16) {}
+#endif
// All stages use the same function call ABI to chain into each other, but there are three types:
// GG: geometry in, geometry out -- think, a matrix
@@ -2362,52 +2386,97 @@ static ABI void just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U1
// These three STAGE_ macros let you define each type of stage,
// and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate.
-#define STAGE_GG(name, ...) \
- SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
- static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \
- U16 r, U16 g, U16 b, U16 a, \
- U16 dr, U16 dg, U16 db, U16 da) { \
- auto x = join<F>(r,g), \
- y = join<F>(b,a); \
- name##_k(Ctx{program}, dx,dy,tail, x,y); \
- split(x, &r,&g); \
- split(y, &b,&a); \
- auto next = (Stage)load_and_inc(program); \
- next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
- } \
- SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)
-
-#define STAGE_GP(name, ...) \
- SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
- U16& r, U16& g, U16& b, U16& a, \
- U16& dr, U16& dg, U16& db, U16& da); \
- static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \
- U16 r, U16 g, U16 b, U16 a, \
- U16 dr, U16 dg, U16 db, U16 da) { \
- auto x = join<F>(r,g), \
- y = join<F>(b,a); \
- name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \
- auto next = (Stage)load_and_inc(program); \
- next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
- } \
- SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
- U16& r, U16& g, U16& b, U16& a, \
- U16& dr, U16& dg, U16& db, U16& da)
-
-#define STAGE_PP(name, ...) \
- SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
- U16& r, U16& g, U16& b, U16& a, \
- U16& dr, U16& dg, U16& db, U16& da); \
- static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \
- U16 r, U16 g, U16 b, U16 a, \
- U16 dr, U16 dg, U16 db, U16 da) { \
- name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \
- auto next = (Stage)load_and_inc(program); \
- next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
- } \
- SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
- U16& r, U16& g, U16& b, U16& a, \
- U16& dr, U16& dg, U16& db, U16& da)
+#if JUMPER_NARROW_STAGES
+ #define STAGE_GG(name, ...) \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
+ static ABI void name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
+ auto x = join<F>(r,g), \
+ y = join<F>(b,a); \
+ name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y); \
+ split(x, &r,&g); \
+ split(y, &b,&a); \
+ auto next = (Stage)load_and_inc(program); \
+ next(params,program, r,g,b,a); \
+ } \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)
+
+ #define STAGE_GP(name, ...) \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
+ U16& r, U16& g, U16& b, U16& a, \
+ U16& dr, U16& dg, U16& db, U16& da); \
+ static ABI void name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
+ auto x = join<F>(r,g), \
+ y = join<F>(b,a); \
+ name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y, r,g,b,a, \
+ params->dr,params->dg,params->db,params->da); \
+ auto next = (Stage)load_and_inc(program); \
+ next(params,program, r,g,b,a); \
+ } \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
+ U16& r, U16& g, U16& b, U16& a, \
+ U16& dr, U16& dg, U16& db, U16& da)
+
+ #define STAGE_PP(name, ...) \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
+ U16& r, U16& g, U16& b, U16& a, \
+ U16& dr, U16& dg, U16& db, U16& da); \
+ static ABI void name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
+ name##_k(Ctx{program}, params->dx,params->dy,params->tail, r,g,b,a, \
+ params->dr,params->dg,params->db,params->da); \
+ auto next = (Stage)load_and_inc(program); \
+ next(params,program, r,g,b,a); \
+ } \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
+ U16& r, U16& g, U16& b, U16& a, \
+ U16& dr, U16& dg, U16& db, U16& da)
+#else
+ #define STAGE_GG(name, ...) \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
+ static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \
+ U16 r, U16 g, U16 b, U16 a, \
+ U16 dr, U16 dg, U16 db, U16 da) { \
+ auto x = join<F>(r,g), \
+ y = join<F>(b,a); \
+ name##_k(Ctx{program}, dx,dy,tail, x,y); \
+ split(x, &r,&g); \
+ split(y, &b,&a); \
+ auto next = (Stage)load_and_inc(program); \
+ next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
+ } \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)
+
+ #define STAGE_GP(name, ...) \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
+ U16& r, U16& g, U16& b, U16& a, \
+ U16& dr, U16& dg, U16& db, U16& da); \
+ static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \
+ U16 r, U16 g, U16 b, U16 a, \
+ U16 dr, U16 dg, U16 db, U16 da) { \
+ auto x = join<F>(r,g), \
+ y = join<F>(b,a); \
+ name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \
+ auto next = (Stage)load_and_inc(program); \
+ next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
+ } \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
+ U16& r, U16& g, U16& b, U16& a, \
+ U16& dr, U16& dg, U16& db, U16& da)
+
+ #define STAGE_PP(name, ...) \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
+ U16& r, U16& g, U16& b, U16& a, \
+ U16& dr, U16& dg, U16& db, U16& da); \
+ static ABI void name(size_t tail, void** program, size_t dx, size_t dy, \
+ U16 r, U16 g, U16 b, U16 a, \
+ U16 dr, U16 dg, U16 db, U16 da) { \
+ name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \
+ auto next = (Stage)load_and_inc(program); \
+ next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
+ } \
+ SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
+ U16& r, U16& g, U16& b, U16& a, \
+ U16& dr, U16& dg, U16& db, U16& da)
+#endif
// ~~~~~~ Commonly used helper functions ~~~~~~ //
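As an aside on the start_pipeline loops in this patch (both the narrow and wide variants): they use a standard strip-mining pattern, running the pipeline on N pixels at a time while a full vector's worth remains, then issuing one last call with tail set to the remainder so stages can mask partial loads and stores. A generic sketch of that pattern; strip_mine and process are illustrative names, not Skia's:

    #include <cstddef>

    // Walk a row [x0, xlimit) in vector-width chunks. `tail` is zero for
    // full chunks and the remainder count (1..N-1) for the final partial one.
    template <size_t N, typename Fn>
    void strip_mine(size_t x0, size_t xlimit, Fn process) {
        size_t dx = x0;
        for (; dx + N <= xlimit; dx += N) {
            process(dx, /*tail=*/0);   // full N-pixel chunk
        }
        if (size_t tail = xlimit - dx) {
            process(dx, tail);         // partial chunk: stages mask by tail
        }
    }

With N = 8 and a 21-pixel row starting at x0 = 0, process runs at dx = 0 and dx = 8 with tail 0, then once more at dx = 16 with tail 5.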