author      2017-07-27 12:31:34 -0400
committer   2017-07-27 18:33:13 +0000
commit      f1b24e0dec93ad2237cf424a797bb88c76b6356b (patch)
tree        0a282f9c95558850cf9ce8015eff00b847e776a5 /src/jumper/SkJumper_stages.cpp
parent      7311b40efb0e1a75f3024eac3386e4f7d806675a (diff)
rearrange SkJumper registers on 32-bit x86
There are not many registers on 32-bit x86, and we're using most to pass
Stage function arguments. This means few are available as temporaries,
and we're forced to hit the stack all the time. xmm registers are the
most egregious example: we use all 8 registers to pass data, leaving none
free as temporaries.
This CL cuts things down pretty dramatically, from passing 5 general
purpose and 8 xmm registers to 2 general purpose and 4 xmm registers.
One of the two general purpose registers is a pointer to space on the
stack where we store all those other values.
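Concretely, the calling-convention change looks like this. The sketch below paraphrases the diff further down: K and F stand in here for Skia's SkJumper_constants and SIMD float-vector types, and the two signatures are given distinct names only so they can sit side by side (in the source both are called Stage, selected by an #if).

```cpp
#include <cstddef>

struct K;          // stand-in for const SkJumper_constants
using F = float;   // stand-in for the SIMD float vector type

// Before: 5 general-purpose arguments (k, program, x, y, tail)
// plus 8 xmm arguments (r,g,b,a and dr,dg,db,da).
using StageBefore = void(K* k, void** program, size_t x, size_t y, size_t tail,
                         F,F,F,F, F,F,F,F);

// After, on 32-bit x86: 2 general-purpose arguments (params, program)
// plus 4 xmm arguments (r,g,b,a).  Everything else is reached through
// a Params block living on the caller's stack.
struct Params {
    size_t x, y, tail;
    K*     k;
    F      dr,dg,db,da;
};
using StageAfter = void(Params* params, void** program, F r, F g, F b, F a);
```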
Every stage function needs to use the program pointer, so that stays in
a general purpose register. Almost every stage uses the r,g,b,a
vectors, so they stay in xmm registers. The rest (destination x,y, the
tail mask, a pointer to tricky constants, and the dr,dg,db,da vectors)
now live on the stack.
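In start_pipeline() this shows up as one Params value per row, built on the stack and updated in place as the loop walks across the scanline. The snippet below is lifted from the diff that follows; x0, y, k, v (the zero-initialized vector), kStride, xlimit, start, and program all come from the surrounding function.

```cpp
Params params = { x0,y,0,k, v,v,v,v };
while (params.x + kStride <= xlimit) {
    start(&params,program, v,v,v,v);
    params.x += kStride;
}
if (size_t tail = xlimit - params.x) {
    params.tail = tail;
    start(&params,program, v,v,v,v);
}
```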
The generated code is about 20K smaller and runs about 20% faster.
$ out/monobench SkRasterPipeline_srgb 200
Before: 358.784ns
After: 282.563ns
Change-Id: Icc117af95c1a81c41109984b32e0841022f0d1a6
Reviewed-on: https://skia-review.googlesource.com/27620
Reviewed-by: Florin Malita <fmalita@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/jumper/SkJumper_stages.cpp')
-rw-r--r-- | src/jumper/SkJumper_stages.cpp | 79
1 file changed, 62 insertions(+), 17 deletions(-)
```diff
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index 4d78e9d0f3..47e3ad22ea 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -42,9 +42,22 @@ using K = const SkJumper_constants;
 // tail == 0 ~~> work on a full kStride pixels
 // tail != 0 ~~> work on only the first tail pixels
 // tail is always < kStride.
-//
-// We keep program the second argument, so that it's passed in rsi for load_and_inc().
-using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,F, F,F,F,F);
+
+#if defined(__i386__) || defined(_M_IX86)
+    // On 32-bit x86 we've only got 8 xmm registers, so we keep the 4 hottest (r,g,b,a)
+    // in registers and the d-registers on the stack (giving us 4 temporary registers).
+    // General-purpose registers are also tight, so we put most of those on the stack too.
+    struct Params {
+        size_t x, y, tail;
+        K* k;
+        F dr,dg,db,da;
+    };
+    using Stage = void(Params*, void** program, F r, F g, F b, F a);
+
+#else
+    // We keep program the second argument, so that it's passed in rsi for load_and_inc().
+    using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,F, F,F,F,F);
+#endif
 
 #if defined(JUMPER) && defined(__AVX__)
     // We really want to make sure all paths go through this function's (implicit) vzeroupper.
@@ -60,8 +73,19 @@ extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t y
     F v{};
 #endif
     auto start = (Stage*)load_and_inc(program);
-    size_t x0 = x;
+    const size_t x0 = x;
     for (; y < ylimit; y++) {
+    #if defined(__i386__) || defined(_M_IX86)
+        Params params = { x0,y,0,k, v,v,v,v };
+        while (params.x + kStride <= xlimit) {
+            start(&params,program, v,v,v,v);
+            params.x += kStride;
+        }
+        if (size_t tail = xlimit - params.x) {
+            params.tail = tail;
+            start(&params,program, v,v,v,v);
+        }
+    #else
         x = x0;
         while (x + kStride <= xlimit) {
             start(k,program,x,y,0, v,v,v,v, v,v,v,v);
@@ -70,26 +94,47 @@ extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t y
         if (size_t tail = xlimit - x) {
             start(k,program,x,y,tail, v,v,v,v, v,v,v,v);
         }
+    #endif
     }
 }
 
-#define STAGE(name)                                                                   \
-    SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail,              \
-                     F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);             \
-    extern "C" void WRAP(name)(K* k, void** program, size_t x, size_t y, size_t tail, \
-                               F r, F g, F b, F a, F dr, F dg, F db, F da) {          \
-        LazyCtx ctx(program);                                                         \
-        name##_k(k,ctx,x,y,tail, r,g,b,a, dr,dg,db,da);                               \
-        auto next = (Stage*)load_and_inc(program);                                    \
-        next(k,program,x,y,tail, r,g,b,a, dr,dg,db,da);                               \
-    }                                                                                 \
-    SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail,              \
-                     F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+#if defined(__i386__) || defined(_M_IX86)
+    #define STAGE(name)                                                                   \
+        SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail,              \
+                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);             \
+        extern "C" void WRAP(name)(Params* params, void** program,                        \
+                                   F r, F g, F b, F a) {                                  \
+            LazyCtx ctx(program);                                                         \
+            name##_k(params->k,ctx,params->x,params->y,params->tail, r,g,b,a,             \
+                     params->dr, params->dg, params->db, params->da);                     \
+            auto next = (Stage*)load_and_inc(program);                                    \
+            next(params,program, r,g,b,a);                                                \
+        }                                                                                 \
+        SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail,              \
+                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+#else
+    #define STAGE(name)                                                                   \
+        SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail,              \
+                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);             \
+        extern "C" void WRAP(name)(K* k, void** program, size_t x, size_t y, size_t tail, \
+                                   F r, F g, F b, F a, F dr, F dg, F db, F da) {          \
+            LazyCtx ctx(program);                                                         \
+            name##_k(k,ctx,x,y,tail, r,g,b,a, dr,dg,db,da);                               \
+            auto next = (Stage*)load_and_inc(program);                                    \
+            next(k,program,x,y,tail, r,g,b,a, dr,dg,db,da);                               \
+        }                                                                                 \
+        SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail,              \
+                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+#endif
 
 // just_return() is a simple no-op stage that only exists to end the chain,
 // returning back up to start_pipeline(), and from there to the caller.
-extern "C" void WRAP(just_return)(K*, void**, size_t,size_t,size_t, F,F,F,F, F,F,F,F) {}
+#if defined(__i386__) || defined(_M_IX86)
+    extern "C" void WRAP(just_return)(Params*, void**, F,F,F,F) {}
+#else
+    extern "C" void WRAP(just_return)(K*, void**, size_t,size_t,size_t, F,F,F,F, F,F,F,F) {}
+#endif
 
 // We could start defining normal Stages now.  But first, some helper functions.
```