1 files changed, 29 insertions, 12 deletions
diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp
index 5c73ea8cbe..0c019f8fbc 100644
--- a/src/jumper/SkJumper_stages_8bit.cpp
+++ b/src/jumper/SkJumper_stages_8bit.cpp
@@ -5,23 +5,27 @@
  * found in the LICENSE file.
  */
 
+// This restricted SkJumper backend works on 8-bit per channel interlaced
+// pixels.  This is the natural format for kN32_SkColorType buffers, and we
+// hope the stages in this file can replace many custom legacy routines.
+
 #include "SkJumper.h"
 #include "SkJumper_misc.h"
 
-#if defined(__SSE2__)
+// As an experiment we bake ARMv8 8-bit code in as normally compiled Skia code.
+// Any other platform (so far) is offline-only.
+#if defined(JUMPER_IS_OFFLINE) || (defined(__clang__) && defined(__aarch64__))
+
+#if defined(__aarch64__)
+    #include <arm_neon.h>
+#else
     #include <immintrin.h>
 #endif
 
-// This restricted SkJumper backend works on 8-bit per channel interlaced
-// pixels.  This is the natural format for kN32_SkColorType buffers, and we
-// hope the stages in this file can replace many custom legacy routines.
-
 #if !defined(JUMPER_IS_OFFLINE)
-    #error "This file must be pre-compiled."
+    #define WRAP(name) sk_##name##_8bit
 #elif defined(__aarch64__)
     #define WRAP(name) sk_##name##_aarch64_8bit
-#elif defined(__arm__)
-    #define WRAP(name) sk_##name##_vfp4_8bit
 #elif defined(__AVX2__)
     #define WRAP(name) sk_##name##_hsw_8bit
 #elif defined(__SSE4_1__)
@@ -112,7 +116,7 @@ SI V operator*(V x, V y) {
 
 template <typename T>
 SI T inv(T v) { return 0xff - v; }
-SI V two(V v) { return v + v; }
+
 SI V lerp(V from, V to, V t) { return to*t + from*inv(t); }
 
 SI V alpha(V v) {
@@ -162,10 +166,13 @@ SI V saturated_add(V a, V b) {
       b_lo, b_hi;
     split(a.u8x4, &a_lo, &a_hi);
     split(b.u8x4, &b_lo, &b_hi);
-#if defined(__AVX2__)
+#if defined(__aarch64__)
+    return join(vqaddq_u8(a_lo, b_lo),
+                vqaddq_u8(a_hi, b_hi));
+#elif defined(__AVX2__)
     return join(_mm256_adds_epu8(a_lo, b_lo),
                 _mm256_adds_epu8(a_hi, b_hi));
-#else
+#elif defined(__SSE2__)
     return join(_mm_adds_epu8(a_lo, b_lo),
                 _mm_adds_epu8(a_hi, b_hi));
 #endif
@@ -185,7 +192,11 @@ using Stage = void(const Params* params, void** program, R src_lo, R src_hi, R d
 MAYBE_MSABI
 extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t ylimit,
                                      void** program, const SkJumper_constants*) {
-    R r;
+#if defined(JUMPER_IS_OFFLINE)
+    R r;      // Fastest to start uninitialized.
+#else
+    R r{};    // Next best is zero'd for compilers that will complain about uninitialized values.
+#endif
     auto start = (Stage*)load_and_inc(program);
     for (; y < ylimit; y++) {
         Params params = { x,y,0 };
@@ -223,6 +234,7 @@ SI V load(const T* src, size_t tail) {
     if (__builtin_expect(tail, 0)) {
         V v = 0;
         switch (tail) {
+        #if defined(__AVX2__)
             case 15: v[14] = src[14];
             case 14: v[13] = src[13];
             case 13: v[12] = src[12];
@@ -231,6 +243,7 @@ SI V load(const T* src, size_t tail) {
             case 10: v[ 9] = src[ 9];
             case  9: v[ 8] = src[ 8];
             case  8: memcpy(&v, src, 8*sizeof(T)); break;
+        #endif
             case  7: v[6] = src[6];
             case  6: v[5] = src[5];
             case  5: v[4] = src[4];
@@ -249,6 +262,7 @@ SI void store(T* dst, V v, size_t tail) {
     __builtin_assume(tail < kStride);
     if (__builtin_expect(tail, 0)) {
         switch (tail) {
+        #if defined(__AVX2__)
             case 15: dst[14] = v[14];
             case 14: dst[13] = v[13];
             case 13: dst[12] = v[12];
@@ -257,6 +271,7 @@ SI void store(T* dst, V v, size_t tail) {
             case 10: dst[ 9] = v[ 9];
             case  9: dst[ 8] = v[ 8];
             case  8: memcpy(dst, &v, 8*sizeof(T)); break;
+        #endif
             case  7: dst[6] = v[6];
             case  6: dst[5] = v[5];
             case  5: dst[4] = v[4];
@@ -461,3 +476,5 @@ STAGE(overlay) {
 //   colorburn  |
 //   colordodge  > these involve division, which makes them (much) slower than the float stages.
 //   softlight  |
+
+#endif