diff options
author | Mike Klein <mtklein@chromium.org> | 2016-10-06 15:06:38 -0400 |
---|---|---|
committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2016-10-07 12:52:29 +0000 |
commit | 1aebdaee0e2aa4324509fd3ad4c40c21703ae4a2 (patch) | |
tree | c5ffae6c59217f3d228891177e1d50d7f784801a /src/opts/SkOpts_hsw.cpp | |
parent | 2766cc567d5c939730fadd2d865e4bdf05477263 (diff) |
SkRasterPipeline: 8x pipelines
Bench runtime changes:
sRGB: 7194 -> 3735 = 1.93x faster
F16: 6531 -> 2559 = 2.55x faster
Instead of building 4x and 1-3x pipelines and then maybe 8x and 1-7x, build either the short ones or the long ones, but not both. If we just take care to use a compatible run_pipeline(), there's some cross-module type disagreement but everything works out in the end.
Oddly, a few places that looked like they'd be faster using SkNx_fma() or Sk4f_round()/Sk8f_round() are actually faster the long way, e.g. multiply, add 0.5, truncate. Curious! In all the other places you see here that I've used SkNx_fma(), it's been a significant speedup.
This folds in a couple refactors and cleanups that I've been meaning to do. Hope you don't mind... I find the new code considerably easier to read than the old code.
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2990
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Change-Id: I1c82e5755d8e44cc0b9c6673d04b117f85d71a3a
Reviewed-on: https://skia-review.googlesource.com/2990
Reviewed-by: Matt Sarett <msarett@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/opts/SkOpts_hsw.cpp')
-rw-r--r-- | src/opts/SkOpts_hsw.cpp | 74 |
1 files changed, 73 insertions, 1 deletions
diff --git a/src/opts/SkOpts_hsw.cpp b/src/opts/SkOpts_hsw.cpp index 53e2e5acdd..c994bf6534 100644 --- a/src/opts/SkOpts_hsw.cpp +++ b/src/opts/SkOpts_hsw.cpp @@ -7,9 +7,81 @@ #include "SkOpts.h" + #define SK_OPTS_NS hsw +#include "SkRasterPipeline_opts.h" namespace SkOpts { - void Init_hsw() { } + void Init_hsw() { + +// The 32-bit MSVC __vectorcall ABI mangles type information into the names of +// SkOpts::body, SkOpts::tail, and SkOpts::run_pipeline, so that this code will +// not link as written: they're all defined in a file where SkRasterPipeline::V +// is Sk4f, but here we're seeing it as Sk8f. +// +// We can work around this by storing those pointers as some generic function +// pointer type like void(*)(), but it's even simpler to just not do any of this +// when targeting 32-bit Windows. +#if !defined(_M_IX86) + + run_pipeline = SK_OPTS_NS::run_pipeline; + + #define STAGE(stage) \ + body[SkRasterPipeline::stage] = SK_OPTS_NS::stage; \ + tail[SkRasterPipeline::stage] = SK_OPTS_NS::stage##_tail + + STAGE(store_565); + STAGE(store_srgb); + STAGE(store_f16); + + STAGE(load_s_565); + STAGE(load_s_srgb); + STAGE(load_s_f16); + + STAGE(load_d_565); + STAGE(load_d_srgb); + STAGE(load_d_f16); + + STAGE(scale_u8); + + STAGE(lerp_u8); + STAGE(lerp_565); + #undef STAGE + + #define STAGE(stage) \ + body[SkRasterPipeline::stage] = SK_OPTS_NS::stage; \ + tail[SkRasterPipeline::stage] = SK_OPTS_NS::stage + + STAGE(lerp_constant_float); + STAGE(constant_color); + + STAGE(dst); + STAGE(dstatop); + STAGE(dstin); + STAGE(dstout); + STAGE(dstover); + STAGE(srcatop); + STAGE(srcin); + STAGE(srcout); + STAGE(srcover); + STAGE(clear); + STAGE(modulate); + STAGE(multiply); + STAGE(plus_); + STAGE(screen); + STAGE(xor_); + STAGE(colorburn); + STAGE(colordodge); + STAGE(darken); + STAGE(difference); + STAGE(exclusion); + STAGE(hardlight); + STAGE(lighten); + STAGE(overlay); + STAGE(softlight); + #undef STAGE + +#endif // !defined(_M_IX86) + } } |