diff options
author | Mike Klein <mtklein@chromium.org> | 2017-08-28 15:51:46 -0400 |
---|---|---|
committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2017-08-28 21:04:07 +0000 |
commit | 08133583d5e1cdfdcc41b4bb078fcfb64137f058 (patch) | |
tree | 213dbefcb5d7b2020eed2c19b6f33c281f6548d4 /src/jumper | |
parent | c5b2c86cd1fca9f17d814ae750378843ca0ce216 (diff) |
8-bit jumper on armv8
The GM diffs are all minor and what you'd expect.
I did a quick performance sanity check, which also looks fine.
$ out/ok bench rp filter:search=Modulate
[blendmode_rect_Modulate] 30.2ms @0 32ms @95 32ms @100
[blendmode_mask_Modulate] 12.6ms @0 12.6ms @95 14.5ms @100
~~~>
[blendmode_rect_Modulate] 11.2ms @0 11.7ms @95 12.4ms @100
[blendmode_mask_Modulate] 10.5ms @0 23.6ms @95 23.9ms @100
This isn't even really the fastest we can make 8-bit go on ARMv8;
it's actually much more natural to work de-interlaced there. Lots
of room to follow up.
Change-Id: I86b1099f6742bcb0b8b4fa153e85eaba9567cbf7
Reviewed-on: https://skia-review.googlesource.com/39740
Reviewed-by: Florin Malita <fmalita@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/jumper')
-rw-r--r-- | src/jumper/SkJumper.cpp | 42 | ||||
-rw-r--r-- | src/jumper/SkJumper_stages_8bit.cpp | 37 |
2 files changed, 60 insertions, 19 deletions
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp index 9f8e970f32..315110faf2 100644 --- a/src/jumper/SkJumper.cpp +++ b/src/jumper/SkJumper.cpp @@ -110,7 +110,7 @@ using StartPipelineFn = void(size_t,size_t,size_t,size_t, void**,K*); extern "C" { #if __has_feature(memory_sanitizer) - // We'll just run portable code. + // We'll just run baseline code. #elif defined(__arm__) StartPipelineFn ASM(start_pipeline,vfp4); @@ -168,12 +168,22 @@ extern "C" { #endif - // Portable, single-pixel stages. + // Baseline code compiled as a normal part of Skia. StartPipelineFn sk_start_pipeline; StageFn sk_just_return; #define M(st) StageFn sk_##st; SK_RASTER_PIPELINE_STAGES(M) #undef M + +#if defined(__clang__) && defined(__aarch64__) + // We also compile 8-bit stages on ARMv8 as a normal part of Skia when compiled with Clang. + StartPipelineFn sk_start_pipeline_8bit; + StageFn sk_just_return_8bit; + #define M(st) StageFn sk_##st##_8bit; + SK_RASTER_PIPELINE_STAGES(M) + #undef M +#endif + } #if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64)) @@ -198,6 +208,16 @@ extern "C" { } LOWP_STAGES(M) #undef M +#elif defined(__clang__) && defined(__aarch64__) + template <SkRasterPipeline::StockStage st> + static constexpr StageFn* aarch64_8bit() { return nullptr; } + + #define M(st) \ + template <> constexpr StageFn* aarch64_8bit<SkRasterPipeline::st>() { \ + return sk_##st##_8bit; \ + } + LOWP_STAGES(M) + #undef M #endif // Engines comprise everything we need to run SkRasterPipelines. @@ -207,20 +227,20 @@ struct SkJumper_Engine { StageFn* just_return; }; -// We'll default to this portable engine, but try to choose a better one at runtime. -static const SkJumper_Engine kPortable = { +// We'll default to this baseline engine, but try to choose a better one at runtime. +static const SkJumper_Engine kBaseline = { #define M(stage) sk_##stage, { SK_RASTER_PIPELINE_STAGES(M) }, #undef M sk_start_pipeline, sk_just_return, }; -static SkJumper_Engine gEngine = kPortable; +static SkJumper_Engine gEngine = kBaseline; static SkOnce gChooseEngineOnce; static SkJumper_Engine choose_engine() { #if __has_feature(memory_sanitizer) - // We'll just run portable code. + // We'll just run baseline code. #elif defined(__arm__) if (1 && SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) { @@ -283,7 +303,7 @@ static SkJumper_Engine choose_engine() { } #endif - return kPortable; + return kBaseline; } #ifndef SK_JUMPER_DISABLE_8BIT @@ -326,6 +346,14 @@ static SkJumper_Engine choose_engine() { #undef M }; } + #elif defined(__clang__) && defined(__aarch64__) + return { + #define M(st) aarch64_8bit<SkRasterPipeline::st>(), + { SK_RASTER_PIPELINE_STAGES(M) }, + sk_start_pipeline_8bit, + sk_just_return_8bit, + #undef M + }; #endif return kNone; } diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp index 5c73ea8cbe..edd6689c8c 100644 --- a/src/jumper/SkJumper_stages_8bit.cpp +++ b/src/jumper/SkJumper_stages_8bit.cpp @@ -5,23 +5,27 @@ * found in the LICENSE file. */ +// This restricted SkJumper backend works on 8-bit per channel interlaced +// pixels. This is the natural format for kN32_SkColorType buffers, and we +// hope the stages in this file can replace many custom legacy routines. + #include "SkJumper.h" #include "SkJumper_misc.h" -#if defined(__SSE2__) +// As an experiment we bake ARMv8 8-bit code in as normally compiled Skia code. +// Any other platform (so far) is offline-only. +#if defined(JUMPER_IS_OFFLINE) || (defined(__clang__) && defined(__aarch64__)) + +#if defined(__aarch64__) + #include <arm_neon.h> +#else #include <immintrin.h> #endif -// This restricted SkJumper backend works on 8-bit per channel interlaced -// pixels. This is the natural format for kN32_SkColorType buffers, and we -// hope the stages in this file can replace many custom legacy routines. - #if !defined(JUMPER_IS_OFFLINE) - #error "This file must be pre-compiled." + #define WRAP(name) sk_##name##_8bit #elif defined(__aarch64__) #define WRAP(name) sk_##name##_aarch64_8bit -#elif defined(__arm__) - #define WRAP(name) sk_##name##_vfp4_8bit #elif defined(__AVX2__) #define WRAP(name) sk_##name##_hsw_8bit #elif defined(__SSE4_1__) @@ -112,7 +116,7 @@ SI V operator*(V x, V y) { template <typename T> SI T inv(T v) { return 0xff - v; } -SI V two(V v) { return v + v; } + SI V lerp(V from, V to, V t) { return to*t + from*inv(t); } SI V alpha(V v) { @@ -162,10 +166,13 @@ SI V saturated_add(V a, V b) { b_lo, b_hi; split(a.u8x4, &a_lo, &a_hi); split(b.u8x4, &b_lo, &b_hi); -#if defined(__AVX2__) +#if defined(__aarch64__) + return join(vqaddq_u8(a_lo, b_lo), + vqaddq_u8(a_hi, b_hi)); +#elif defined(__AVX2__) return join(_mm256_adds_epu8(a_lo, b_lo), _mm256_adds_epu8(a_hi, b_hi)); -#else +#elif defined(__SSE2__) return join(_mm_adds_epu8(a_lo, b_lo), _mm_adds_epu8(a_hi, b_hi)); #endif @@ -185,7 +192,11 @@ using Stage = void(const Params* params, void** program, R src_lo, R src_hi, R d MAYBE_MSABI extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t ylimit, void** program, const SkJumper_constants*) { - R r; +#if defined(JUMPER_IS_OFFLINE) + R r; // Fastest to start uninitialized. +#else + R r{}; // Next best is zero'd for compilers that will complain about uninitialized values. +#endif auto start = (Stage*)load_and_inc(program); for (; y < ylimit; y++) { Params params = { x,y,0 }; @@ -461,3 +472,5 @@ STAGE(overlay) { // colorburn | // colordodge > these involve division, which makes them (much) slower than the float stages. // softlight | + +#endif |