author      2017-07-27 12:31:34 -0400
committer   2017-07-27 18:33:13 +0000
commit      f1b24e0dec93ad2237cf424a797bb88c76b6356b (patch)
tree        0a282f9c95558850cf9ce8015eff00b847e776a5 /src/jumper/SkJumper_stages.cpp
parent      7311b40efb0e1a75f3024eac3386e4f7d806675a (diff)
rearrange SkJumper registers on 32-bit x86
There are not many registers on 32-bit x86, and we're using most to pass
Stage function arguments. This means few are available as temporaries,
and we're forced to hit the stack all the time. xmm registers are the
most egregious example: we use all 8 registers to pass data, leaving none
free as temporaries.
This CL cuts things down pretty dramatically, from passing 5 general
purpose and 8 xmm registers to 2 general purpose and 4 xmm registers.
One of the two general purpose registers is a pointer to space on the
stack where we store all those other values.
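Concretely, the calling-convention change looks like this. The sketch below paraphrases the diff further down: K and F stand in here for Skia's SkJumper_constants and SIMD float-vector types, and the two signatures are given distinct names only so they can sit side by side (in the source both are called Stage, selected by an #if).

```cpp
#include <cstddef>

struct K;          // stand-in for const SkJumper_constants
using F = float;   // stand-in for the SIMD float vector type

// Before: 5 general-purpose arguments (k, program, x, y, tail)
// plus 8 xmm arguments (r,g,b,a and dr,dg,db,da).
using StageBefore = void(K* k, void** program, size_t x, size_t y, size_t tail,
                         F,F,F,F, F,F,F,F);

// After, on 32-bit x86: 2 general-purpose arguments (params, program)
// plus 4 xmm arguments (r,g,b,a).  Everything else is reached through
// a Params block living on the caller's stack.
struct Params {
    size_t x, y, tail;
    K*     k;
    F      dr,dg,db,da;
};
using StageAfter = void(Params* params, void** program, F r, F g, F b, F a);
```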
Every stage function needs to use the program pointer, so that stays in
a general purpose register. Almost every stage uses the r,g,b,a
vectors, so they stay in xmm registers. The rest (destination x,y, the
tail mask, a pointer to tricky constants, and the dr,dg,db,da vectors)
now live on the stack.
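In start_pipeline() this shows up as one Params value per row, built on the stack and updated in place as the loop walks across the scanline. The snippet below is lifted from the diff that follows; x0, y, k, v (the zero-initialized vector), kStride, xlimit, start, and program all come from the surrounding function.

```cpp
Params params = { x0,y,0,k, v,v,v,v };
while (params.x + kStride <= xlimit) {
    start(&params,program, v,v,v,v);
    params.x += kStride;
}
if (size_t tail = xlimit - params.x) {
    params.tail = tail;
    start(&params,program, v,v,v,v);
}
```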
The generated code is about 20K smaller and runs about 20% faster.
$ out/monobench SkRasterPipeline_srgb 200
Before: 358.784ns
After: 282.563ns
Change-Id: Icc117af95c1a81c41109984b32e0841022f0d1a6
Reviewed-on: https://skia-review.googlesource.com/27620
Reviewed-by: Florin Malita <fmalita@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/jumper/SkJumper_stages.cpp')
-rw-r--r-- | src/jumper/SkJumper_stages.cpp | 79
1 file changed, 62 insertions(+), 17 deletions(-)
```diff
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index 4d78e9d0f3..47e3ad22ea 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -42,9 +42,22 @@ using K = const SkJumper_constants;
 // tail == 0 ~~> work on a full kStride pixels
 // tail != 0 ~~> work on only the first tail pixels
 // tail is always < kStride.
-//
-// We keep program the second argument, so that it's passed in rsi for load_and_inc().
-using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,F, F,F,F,F);
+
+#if defined(__i386__) || defined(_M_IX86)
+    // On 32-bit x86 we've only got 8 xmm registers, so we keep the 4 hottest (r,g,b,a)
+    // in registers and the d-registers on the stack (giving us 4 temporary registers).
+    // General-purpose registers are also tight, so we put most of those on the stack too.
+    struct Params {
+        size_t x, y, tail;
+        K* k;
+        F dr,dg,db,da;
+    };
+    using Stage = void(Params*, void** program, F r, F g, F b, F a);
+
+#else
+    // We keep program the second argument, so that it's passed in rsi for load_and_inc().
+    using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,F, F,F,F,F);
+#endif
 
 #if defined(JUMPER) && defined(__AVX__)
     // We really want to make sure all paths go through this function's (implicit) vzeroupper.
@@ -60,8 +73,19 @@ extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t y
     F v{};
 #endif
     auto start = (Stage*)load_and_inc(program);
-    size_t x0 = x;
+    const size_t x0 = x;
     for (; y < ylimit; y++) {
+    #if defined(__i386__) || defined(_M_IX86)
+        Params params = { x0,y,0,k, v,v,v,v };
+        while (params.x + kStride <= xlimit) {
+            start(&params,program, v,v,v,v);
+            params.x += kStride;
+        }
+        if (size_t tail = xlimit - params.x) {
+            params.tail = tail;
+            start(&params,program, v,v,v,v);
+        }
+    #else
         x = x0;
         while (x + kStride <= xlimit) {
             start(k,program,x,y,0, v,v,v,v, v,v,v,v);
@@ -70,26 +94,47 @@ extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t y
         if (size_t tail = xlimit - x) {
             start(k,program,x,y,tail, v,v,v,v, v,v,v,v);
         }
+    #endif
     }
 }
 
-#define STAGE(name)                                                                   \
-    SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail,              \
-                     F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);             \
-    extern "C" void WRAP(name)(K* k, void** program, size_t x, size_t y, size_t tail, \
-                               F r, F g, F b, F a, F dr, F dg, F db, F da) {          \
-        LazyCtx ctx(program);                                                         \
-        name##_k(k,ctx,x,y,tail, r,g,b,a, dr,dg,db,da);                               \
-        auto next = (Stage*)load_and_inc(program);                                    \
-        next(k,program,x,y,tail, r,g,b,a, dr,dg,db,da);                               \
-    }                                                                                 \
-    SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail,              \
-                     F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+#if defined(__i386__) || defined(_M_IX86)
+    #define STAGE(name)                                                                   \
+        SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail,              \
+                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);             \
+        extern "C" void WRAP(name)(Params* params, void** program,                        \
+                                   F r, F g, F b, F a) {                                  \
+            LazyCtx ctx(program);                                                         \
+            name##_k(params->k,ctx,params->x,params->y,params->tail, r,g,b,a,             \
+                     params->dr, params->dg, params->db, params->da);                     \
+            auto next = (Stage*)load_and_inc(program);                                    \
+            next(params,program, r,g,b,a);                                                \
+        }                                                                                 \
+        SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail,              \
+                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+#else
+    #define STAGE(name)                                                                   \
+        SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail,              \
+                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);             \
+        extern "C" void WRAP(name)(K* k, void** program, size_t x, size_t y, size_t tail, \
+                                   F r, F g, F b, F a, F dr, F dg, F db, F da) {          \
+            LazyCtx ctx(program);                                                         \
+            name##_k(k,ctx,x,y,tail, r,g,b,a, dr,dg,db,da);                               \
+            auto next = (Stage*)load_and_inc(program);                                    \
+            next(k,program,x,y,tail, r,g,b,a, dr,dg,db,da);                               \
+        }                                                                                 \
+        SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail,              \
+                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+#endif
 
 // just_return() is a simple no-op stage that only exists to end the chain,
 // returning back up to start_pipeline(), and from there to the caller.
-extern "C" void WRAP(just_return)(K*, void**, size_t,size_t,size_t, F,F,F,F, F,F,F,F) {}
+#if defined(__i386__) || defined(_M_IX86)
+    extern "C" void WRAP(just_return)(Params*, void**, F,F,F,F) {}
+#else
+    extern "C" void WRAP(just_return)(K*, void**, size_t,size_t,size_t, F,F,F,F, F,F,F,F) {}
+#endif
 
 // We could start defining normal Stages now.  But first, some helper functions.
```