Revert "8-bit jumper on armv8"

This reverts commit 08133583d5e1cdfdcc41b4bb078fcfb64137f058.

Reason for revert: Blocking Android Autoroller on compile error.

Original change's description:
> 8-bit jumper on armv8
>
> The GM diffs are all minor and what you'd expect.
>
> I did a quick performance sanity check, which also looks fine.
>
> $ out/ok bench rp filter:search=Modulate
> [blendmode_rect_Modulate] 30.2ms @0 32ms @95 32ms @100
> [blendmode_mask_Modulate] 12.6ms @0 12.6ms @95 14.5ms @100
> ~~~>
> [blendmode_rect_Modulate] 11.2ms @0 11.7ms @95 12.4ms @100
> [blendmode_mask_Modulate] 10.5ms @0 23.6ms @95 23.9ms @100
>
> This isn't even really the fastest we can make 8-bit go on ARMv8;
> it's actually much more natural to work de-interlaced there. Lots
> of room to follow up.
>
> Change-Id: I86b1099f6742bcb0b8b4fa153e85eaba9567cbf7
> Reviewed-on: https://skia-review.googlesource.com/39740
> Reviewed-by: Florin Malita <fmalita@chromium.org>
> Commit-Queue: Mike Klein <mtklein@chromium.org>

TBR=mtklein@chromium.org,herb@google.com,fmalita@chromium.org,reed@google.com

Change-Id: I71425d8b7fbb66be5cb50025871dd81358111da4
No-Presubmit: true
No-Tree-Checks: true
No-Try: true
Reviewed-on: https://skia-review.googlesource.com/39980
Reviewed-by: Derek Sollenberger <djsollen@google.com>
Commit-Queue: Derek Sollenberger <djsollen@google.com>
author: Derek Sollenberger <djsollen@google.com> 2017-08-29 12:37:50 +0000
committer: Skia Commit-Bot <skia-commit-bot@chromium.org> 2017-08-29 12:38:02 +0000
commit: 6d13575108299951ecdfba6d85c915fcec2bc028 (patch)
tree: 0cae7b1da7b606d9f68b4263a51dfe5495285a72
parent: 6b47c7d19fcc95d2c3dbce582a8d68bb3bf6ba2a (diff)
4 files changed, 22 insertions, 61 deletions
diff --git a/gn/core.gni b/gn/core.gni
index 5cb3e094b2..65b08db282 100644
--- a/gn/core.gni
+++ b/gn/core.gni
@@ -515,7 +515,6 @@ skia_core_sources = [
 skia_core_sources += [
   "$_src/jumper/SkJumper.cpp",
   "$_src/jumper/SkJumper_stages.cpp",
-  "$_src/jumper/SkJumper_stages_8bit.cpp",
 ]
 if (is_win) {
   skia_core_sources += [ "$_src/jumper/SkJumper_generated_win.S" ]
diff --git a/public.bzl b/public.bzl
index 8a37141467..80042730c9 100644
--- a/public.bzl
+++ b/public.bzl
@@ -116,6 +116,9 @@ BASE_SRCS_ALL = struct(
 
         # Defines main.
         "src/sksl/SkSLMain.cpp",
+
+        # Only pre-compiled into SkJumper_generated.S.
+        "src/jumper/SkJumper_stages_8bit.cpp",
     ],
 )
 
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 315110faf2..9f8e970f32 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -110,7 +110,7 @@ using StartPipelineFn = void(size_t,size_t,size_t,size_t, void**,K*);
 extern "C" {
 
 #if __has_feature(memory_sanitizer)
-    // We'll just run baseline code.
+    // We'll just run portable code.
 
 #elif defined(__arm__)
     StartPipelineFn ASM(start_pipeline,vfp4);
@@ -168,22 +168,12 @@ extern "C" {
 
 #endif
 
-    // Baseline code compiled as a normal part of Skia.
+    // Portable, single-pixel stages.
     StartPipelineFn sk_start_pipeline;
     StageFn sk_just_return;
     #define M(st) StageFn sk_##st;
         SK_RASTER_PIPELINE_STAGES(M)
     #undef M
-
-#if defined(__clang__) && defined(__aarch64__)
-    // We also compile 8-bit stages on ARMv8 as a normal part of Skia when compiled with Clang.
-    StartPipelineFn sk_start_pipeline_8bit;
-    StageFn sk_just_return_8bit;
-    #define M(st) StageFn sk_##st##_8bit;
-        SK_RASTER_PIPELINE_STAGES(M)
-    #undef M
-#endif
-
 }
 
 #if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
@@ -208,16 +198,6 @@ extern "C" {
         }
         LOWP_STAGES(M)
     #undef M
-#elif defined(__clang__) && defined(__aarch64__)
-    template <SkRasterPipeline::StockStage st>
-    static constexpr StageFn* aarch64_8bit() { return nullptr; }
-
-    #define M(st)                                                               \
-        template <> constexpr StageFn* aarch64_8bit<SkRasterPipeline::st>() {   \
-            return sk_##st##_8bit;                                              \
-        }
-        LOWP_STAGES(M)
-    #undef M
 #endif
 
 // Engines comprise everything we need to run SkRasterPipelines.
@@ -227,20 +207,20 @@ struct SkJumper_Engine {
     StageFn*         just_return;
 };
 
-// We'll default to this baseline engine, but try to choose a better one at runtime.
-static const SkJumper_Engine kBaseline = {
+// We'll default to this portable engine, but try to choose a better one at runtime.
+static const SkJumper_Engine kPortable = {
 #define M(stage) sk_##stage,
     { SK_RASTER_PIPELINE_STAGES(M) },
 #undef M
     sk_start_pipeline,
     sk_just_return,
 };
-static SkJumper_Engine gEngine = kBaseline;
+static SkJumper_Engine gEngine = kPortable;
 static SkOnce gChooseEngineOnce;
 
 static SkJumper_Engine choose_engine() {
 #if __has_feature(memory_sanitizer)
-    // We'll just run baseline code.
+    // We'll just run portable code.
 
 #elif defined(__arm__)
     if (1 && SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
@@ -303,7 +283,7 @@ static SkJumper_Engine choose_engine() {
     }
 
 #endif
-    return kBaseline;
+    return kPortable;
 }
 
 #ifndef SK_JUMPER_DISABLE_8BIT
@@ -346,14 +326,6 @@ static SkJumper_Engine choose_engine() {
             #undef M
             };
         }
-    #elif defined(__clang__) && defined(__aarch64__)
-        return {
-        #define M(st) aarch64_8bit<SkRasterPipeline::st>(),
-            { SK_RASTER_PIPELINE_STAGES(M) },
-            sk_start_pipeline_8bit,
-            sk_just_return_8bit,
-        #undef M
-        };
     #endif
         return kNone;
     }
diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp
index edd6689c8c..5c73ea8cbe 100644
--- a/src/jumper/SkJumper_stages_8bit.cpp
+++ b/src/jumper/SkJumper_stages_8bit.cpp
@@ -5,27 +5,23 @@
  * found in the LICENSE file.
  */
 
-// This restricted SkJumper backend works on 8-bit per channel interlaced
-// pixels.  This is the natural format for kN32_SkColorType buffers, and we
-// hope the stages in this file can replace many custom legacy routines.
-
 #include "SkJumper.h"
 #include "SkJumper_misc.h"
 
-// As an experiment we bake ARMv8 8-bit code in as normally compiled Skia code.
-// Any other platform (so far) is offline-only.
-#if defined(JUMPER_IS_OFFLINE) || (defined(__clang__) && defined(__aarch64__))
-
-#if defined(__aarch64__)
-    #include <arm_neon.h>
-#else
+#if defined(__SSE2__)
     #include <immintrin.h>
 #endif
 
+// This restricted SkJumper backend works on 8-bit per channel interlaced
+// pixels.  This is the natural format for kN32_SkColorType buffers, and we
+// hope the stages in this file can replace many custom legacy routines.
+
 #if !defined(JUMPER_IS_OFFLINE)
-    #define WRAP(name) sk_##name##_8bit
+    #error "This file must be pre-compiled."
 #elif defined(__aarch64__)
     #define WRAP(name) sk_##name##_aarch64_8bit
+#elif defined(__arm__)
+    #define WRAP(name) sk_##name##_vfp4_8bit
 #elif defined(__AVX2__)
     #define WRAP(name) sk_##name##_hsw_8bit
 #elif defined(__SSE4_1__)
@@ -116,7 +112,7 @@ SI V operator*(V x, V y) {
 
 template <typename T>
 SI T inv(T v) { return 0xff - v; }
-
+SI V two(V v) { return v + v; }
 SI V lerp(V from, V to, V t) { return to*t + from*inv(t); }
 
 SI V alpha(V v) {
@@ -166,13 +162,10 @@ SI V saturated_add(V a, V b) {
       b_lo, b_hi;
     split(a.u8x4, &a_lo, &a_hi);
     split(b.u8x4, &b_lo, &b_hi);
-#if defined(__aarch64__)
-    return join(vqaddq_u8(a_lo, b_lo),
-                vqaddq_u8(a_hi, b_hi));
-#elif defined(__AVX2__)
+#if defined(__AVX2__)
     return join(_mm256_adds_epu8(a_lo, b_lo),
                 _mm256_adds_epu8(a_hi, b_hi));
-#elif defined(__SSE2__)
+#else
     return join(_mm_adds_epu8(a_lo, b_lo),
                 _mm_adds_epu8(a_hi, b_hi));
 #endif
@@ -192,11 +185,7 @@ using Stage = void(const Params* params, void** program, R src_lo, R src_hi, R d
 MAYBE_MSABI
 extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t ylimit,
                                      void** program, const SkJumper_constants*) {
-#if defined(JUMPER_IS_OFFLINE)
-    R r;      // Fastest to start uninitialized.
-#else
-    R r{};    // Next best is zero'd for compilers that will complain about uninitialized values.
-#endif
+    R r;
     auto start = (Stage*)load_and_inc(program);
     for (; y < ylimit; y++) {
         Params params = { x,y,0 };
@@ -472,5 +461,3 @@ STAGE(overlay) {
 //   colorburn  |
 //   colordodge  > these involve division, which makes them (much) slower than the float stages.
 //   softlight  |
-
-#endif
author	Derek Sollenberger <djsollen@google.com>	2017-08-29 12:37:50 +0000
committer	Skia Commit-Bot <skia-commit-bot@chromium.org>	2017-08-29 12:38:02 +0000
commit	6d13575108299951ecdfba6d85c915fcec2bc028 (patch)
tree	0cae7b1da7b606d9f68b4263a51dfe5495285a72
parent	6b47c7d19fcc95d2c3dbce582a8d68bb3bf6ba2a (diff)