8-bit jumper on armv8

The GM diffs are all minor and what you'd expect. I did a quick performance sanity check, which also looks fine. $ out/ok bench rp filter:search=Modulate [blendmode_rect_Modulate] 30.2ms @0 32ms @95 32ms @100 [blendmode_mask_Modulate] 12.6ms @0 12.6ms @95 14.5ms @100 ~~~> [blendmode_rect_Modulate] 11.2ms @0 11.7ms @95 12.4ms @100 [blendmode_mask_Modulate] 10.5ms @0 23.6ms @95 23.9ms @100 This isn't even really the fastest we can make 8-bit go on ARMv8; it's actually much more natural to work de-interlaced there. Lots of room to follow up. Change-Id: I86b1099f6742bcb0b8b4fa153e85eaba9567cbf7 Reviewed-on: https://skia-review.googlesource.com/39740 Reviewed-by: Florin Malita <fmalita@chromium.org> Commit-Queue: Mike Klein <mtklein@chromium.org>
author: Mike Klein <mtklein@chromium.org> 2017-08-28 15:51:46 -0400
committer: Skia Commit-Bot <skia-commit-bot@chromium.org> 2017-08-28 21:04:07 +0000
commit: 08133583d5e1cdfdcc41b4bb078fcfb64137f058 (patch)
tree: 213dbefcb5d7b2020eed2c19b6f33c281f6548d4 /src/jumper
parent: c5b2c86cd1fca9f17d814ae750378843ca0ce216 (diff)
2 files changed, 60 insertions, 19 deletions
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 9f8e970f32..315110faf2 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -110,7 +110,7 @@ using StartPipelineFn = void(size_t,size_t,size_t,size_t, void**,K*);
 extern "C" {
 
 #if __has_feature(memory_sanitizer)
-    // We'll just run portable code.
+    // We'll just run baseline code.
 
 #elif defined(__arm__)
     StartPipelineFn ASM(start_pipeline,vfp4);
@@ -168,12 +168,22 @@ extern "C" {
 
 #endif
 
-    // Portable, single-pixel stages.
+    // Baseline code compiled as a normal part of Skia.
     StartPipelineFn sk_start_pipeline;
     StageFn sk_just_return;
     #define M(st) StageFn sk_##st;
         SK_RASTER_PIPELINE_STAGES(M)
     #undef M
+
+#if defined(__clang__) && defined(__aarch64__)
+    // We also compile 8-bit stages on ARMv8 as a normal part of Skia when compiled with Clang.
+    StartPipelineFn sk_start_pipeline_8bit;
+    StageFn sk_just_return_8bit;
+    #define M(st) StageFn sk_##st##_8bit;
+        SK_RASTER_PIPELINE_STAGES(M)
+    #undef M
+#endif
+
 }
 
 #if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
@@ -198,6 +208,16 @@ extern "C" {
         }
         LOWP_STAGES(M)
     #undef M
+#elif defined(__clang__) && defined(__aarch64__)
+    template <SkRasterPipeline::StockStage st>
+    static constexpr StageFn* aarch64_8bit() { return nullptr; }
+
+    #define M(st)                                                               \
+        template <> constexpr StageFn* aarch64_8bit<SkRasterPipeline::st>() {   \
+            return sk_##st##_8bit;                                              \
+        }
+        LOWP_STAGES(M)
+    #undef M
 #endif
 
 // Engines comprise everything we need to run SkRasterPipelines.
@@ -207,20 +227,20 @@ struct SkJumper_Engine {
     StageFn*         just_return;
 };
 
-// We'll default to this portable engine, but try to choose a better one at runtime.
-static const SkJumper_Engine kPortable = {
+// We'll default to this baseline engine, but try to choose a better one at runtime.
+static const SkJumper_Engine kBaseline = {
 #define M(stage) sk_##stage,
     { SK_RASTER_PIPELINE_STAGES(M) },
 #undef M
     sk_start_pipeline,
     sk_just_return,
 };
-static SkJumper_Engine gEngine = kPortable;
+static SkJumper_Engine gEngine = kBaseline;
 static SkOnce gChooseEngineOnce;
 
 static SkJumper_Engine choose_engine() {
 #if __has_feature(memory_sanitizer)
-    // We'll just run portable code.
+    // We'll just run baseline code.
 
 #elif defined(__arm__)
     if (1 && SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
@@ -283,7 +303,7 @@ static SkJumper_Engine choose_engine() {
     }
 
 #endif
-    return kPortable;
+    return kBaseline;
 }
 
 #ifndef SK_JUMPER_DISABLE_8BIT
@@ -326,6 +346,14 @@ static SkJumper_Engine choose_engine() {
             #undef M
             };
         }
+    #elif defined(__clang__) && defined(__aarch64__)
+        return {
+        #define M(st) aarch64_8bit<SkRasterPipeline::st>(),
+            { SK_RASTER_PIPELINE_STAGES(M) },
+            sk_start_pipeline_8bit,
+            sk_just_return_8bit,
+        #undef M
+        };
     #endif
         return kNone;
     }
diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp
index 5c73ea8cbe..edd6689c8c 100644
--- a/src/jumper/SkJumper_stages_8bit.cpp
+++ b/src/jumper/SkJumper_stages_8bit.cpp
@@ -5,23 +5,27 @@
  * found in the LICENSE file.
  */
 
+// This restricted SkJumper backend works on 8-bit per channel interlaced
+// pixels.  This is the natural format for kN32_SkColorType buffers, and we
+// hope the stages in this file can replace many custom legacy routines.
+
 #include "SkJumper.h"
 #include "SkJumper_misc.h"
 
-#if defined(__SSE2__)
+// As an experiment we bake ARMv8 8-bit code in as normally compiled Skia code.
+// Any other platform (so far) is offline-only.
+#if defined(JUMPER_IS_OFFLINE) || (defined(__clang__) && defined(__aarch64__))
+
+#if defined(__aarch64__)
+    #include <arm_neon.h>
+#else
     #include <immintrin.h>
 #endif
 
-// This restricted SkJumper backend works on 8-bit per channel interlaced
-// pixels.  This is the natural format for kN32_SkColorType buffers, and we
-// hope the stages in this file can replace many custom legacy routines.
-
 #if !defined(JUMPER_IS_OFFLINE)
-    #error "This file must be pre-compiled."
+    #define WRAP(name) sk_##name##_8bit
 #elif defined(__aarch64__)
     #define WRAP(name) sk_##name##_aarch64_8bit
-#elif defined(__arm__)
-    #define WRAP(name) sk_##name##_vfp4_8bit
 #elif defined(__AVX2__)
     #define WRAP(name) sk_##name##_hsw_8bit
 #elif defined(__SSE4_1__)
@@ -112,7 +116,7 @@ SI V operator*(V x, V y) {
 
 template <typename T>
 SI T inv(T v) { return 0xff - v; }
-SI V two(V v) { return v + v; }
+
 SI V lerp(V from, V to, V t) { return to*t + from*inv(t); }
 
 SI V alpha(V v) {
@@ -162,10 +166,13 @@ SI V saturated_add(V a, V b) {
       b_lo, b_hi;
     split(a.u8x4, &a_lo, &a_hi);
     split(b.u8x4, &b_lo, &b_hi);
-#if defined(__AVX2__)
+#if defined(__aarch64__)
+    return join(vqaddq_u8(a_lo, b_lo),
+                vqaddq_u8(a_hi, b_hi));
+#elif defined(__AVX2__)
     return join(_mm256_adds_epu8(a_lo, b_lo),
                 _mm256_adds_epu8(a_hi, b_hi));
-#else
+#elif defined(__SSE2__)
     return join(_mm_adds_epu8(a_lo, b_lo),
                 _mm_adds_epu8(a_hi, b_hi));
 #endif
@@ -185,7 +192,11 @@ using Stage = void(const Params* params, void** program, R src_lo, R src_hi, R d
 MAYBE_MSABI
 extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t ylimit,
                                      void** program, const SkJumper_constants*) {
-    R r;
+#if defined(JUMPER_IS_OFFLINE)
+    R r;      // Fastest to start uninitialized.
+#else
+    R r{};    // Next best is zero'd for compilers that will complain about uninitialized values.
+#endif
     auto start = (Stage*)load_and_inc(program);
     for (; y < ylimit; y++) {
         Params params = { x,y,0 };
@@ -461,3 +472,5 @@ STAGE(overlay) {
 //   colorburn  |
 //   colordodge  > these involve division, which makes them (much) slower than the float stages.
 //   softlight  |
+
+#endif
author	Mike Klein <mtklein@chromium.org>	2017-08-28 15:51:46 -0400
committer	Skia Commit-Bot <skia-commit-bot@chromium.org>	2017-08-28 21:04:07 +0000
commit	08133583d5e1cdfdcc41b4bb078fcfb64137f058 (patch)
tree	213dbefcb5d7b2020eed2c19b6f33c281f6548d4 /src/jumper
parent	c5b2c86cd1fca9f17d814ae750378843ca0ce216 (diff)