use NEON 8-bit stages on ARMv7 too

We don't really use anything very ARMv8-specific in the 8-bit NEON stages, so we can just naturally extend what we're doing to ARMv7 too.

Note that unlike the float stages, we're not requiring VFPv4 either, just NEON. VFPv4 is for FMA and F16<->F32 conversion, both of which are unnecessary for the integer pipeline.

GMs and perf improvement are similar to the previous ARMv8 change.

Change-Id: Id618801ea1920564c1deee144a640a4133c4505f
Reviewed-on: https://skia-review.googlesource.com/39840
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
author: Mike Klein <mtklein@chromium.org> 2017-08-28 17:53:34 -0400
committer: Skia Commit-Bot <skia-commit-bot@chromium.org> 2017-08-29 18:27:51 +0000
commit: b561b764d894260b77d3c44f8fa182802897f2e1 (patch)
tree: 3d3e22e7f0c76bbb7775dba0e566aee28d3322a2 /src/jumper
parent: fe75930ce0b8d9451d29162942badfd568a1ec47 (diff)
4 files changed, 22 insertions, 20 deletions
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 315110faf2..8f3e6a749f 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -175,7 +175,7 @@ extern "C" {
         SK_RASTER_PIPELINE_STAGES(M)
     #undef M
 
-#if defined(__clang__) && defined(__aarch64__)
+#if defined(JUMPER_HAS_NEON_8BIT)
     // We also compile 8-bit stages on ARMv8 as a normal part of Skia when compiled with Clang.
     StartPipelineFn sk_start_pipeline_8bit;
     StageFn sk_just_return_8bit;
@@ -208,13 +208,13 @@ extern "C" {
         }
         LOWP_STAGES(M)
     #undef M
-#elif defined(__clang__) && defined(__aarch64__)
+#elif defined(JUMPER_HAS_NEON_8BIT)
     template <SkRasterPipeline::StockStage st>
-    static constexpr StageFn* aarch64_8bit() { return nullptr; }
+    static constexpr StageFn* neon_8bit() { return nullptr; }
 
-    #define M(st)                                                               \
-        template <> constexpr StageFn* aarch64_8bit<SkRasterPipeline::st>() {   \
-            return sk_##st##_8bit;                                              \
+    #define M(st)                                                            \
+        template <> constexpr StageFn* neon_8bit<SkRasterPipeline::st>() {   \
+            return sk_##st##_8bit;                                           \
         }
         LOWP_STAGES(M)
     #undef M
@@ -346,9 +346,9 @@ static SkJumper_Engine choose_engine() {
             #undef M
             };
         }
-    #elif defined(__clang__) && defined(__aarch64__)
+    #elif defined(JUMPER_HAS_NEON_8BIT)
         return {
-        #define M(st) aarch64_8bit<SkRasterPipeline::st>(),
+        #define M(st) neon_8bit<SkRasterPipeline::st>(),
             { SK_RASTER_PIPELINE_STAGES(M) },
             sk_start_pipeline_8bit,
             sk_just_return_8bit,
diff --git a/src/jumper/SkJumper.h b/src/jumper/SkJumper.h
index 20b8d32aba..4bb851f939 100644
--- a/src/jumper/SkJumper.h
+++ b/src/jumper/SkJumper.h
@@ -50,6 +50,15 @@
     #include <stdint.h>
 #endif
 
+// When compiled with Clang on ARM, we'll have 8-bit NEON stages.
+#if defined(__clang__)
+    #if defined(__aarch64__)
+        #define JUMPER_HAS_NEON_8BIT
+    #elif defined(__arm__) && defined(__ARM_NEON__)
+        #define JUMPER_HAS_NEON_8BIT
+    #endif
+#endif
+
 static const int SkJumper_kMaxStride = 8;
 
 struct SkJumper_constants {
diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp
index 0c019f8fbc..b6d94e3bed 100644
--- a/src/jumper/SkJumper_stages_8bit.cpp
+++ b/src/jumper/SkJumper_stages_8bit.cpp
@@ -14,9 +14,9 @@
 
 // As an experiment we bake ARMv8 8-bit code in as normally compiled Skia code.
 // Any other platform (so far) is offline-only.
-#if defined(JUMPER_IS_OFFLINE) || (defined(__clang__) && defined(__aarch64__))
+#if defined(JUMPER_IS_OFFLINE) || defined(JUMPER_HAS_NEON_8BIT)
 
-#if defined(__aarch64__)
+#if defined(JUMPER_HAS_NEON_8BIT)
     #include <arm_neon.h>
 #else
     #include <immintrin.h>
@@ -24,8 +24,6 @@
 
 #if !defined(JUMPER_IS_OFFLINE)
     #define WRAP(name) sk_##name##_8bit
-#elif defined(__aarch64__)
-    #define WRAP(name) sk_##name##_aarch64_8bit
 #elif defined(__AVX2__)
     #define WRAP(name) sk_##name##_hsw_8bit
 #elif defined(__SSE4_1__)
@@ -166,7 +164,7 @@ SI V saturated_add(V a, V b) {
       b_lo, b_hi;
     split(a.u8x4, &a_lo, &a_hi);
     split(b.u8x4, &b_lo, &b_hi);
-#if defined(__aarch64__)
+#if defined(JUMPER_HAS_NEON_8BIT)
     return join(vqaddq_u8(a_lo, b_lo),
                 vqaddq_u8(a_hi, b_hi));
 #elif defined(__AVX2__)
diff --git a/src/jumper/build_stages.py b/src/jumper/build_stages.py
index 688ad60706..728b0a51c4 100755
--- a/src/jumper/build_stages.py
+++ b/src/jumper/build_stages.py
@@ -109,10 +109,6 @@ vfp4 = [
 subprocess.check_call(clang + cflags + vfp4 +
                       ['-c', stages] +
                       ['-o', 'vfp4.o'])
-# TODO: should work fine... I just want to turn this one on separately from x86
-#subprocess.check_call(clang + cflags + vfp4 +
-#                      ['-c', stages_8bit] +
-#                      ['-o', '8bit_vfp4.o'])
 
 def parse_object_file(dot_o, directive, target=None):
   globl, hidden, label, comment, align = \
@@ -223,12 +219,11 @@ print '#endif'
 print '.text'
 print '#if defined(__arm__)'
 print 'BALIGN4'
-parse_object_file(     'vfp4.o', '.long', target='elf32-littlearm')
-#parse_object_file('8bit_vfp4.o', '.long', target='elf32-littlearm')
+parse_object_file('vfp4.o', '.long', target='elf32-littlearm')
 
 print '#elif defined(__x86_64__)'
 print 'BALIGN32'
-parse_object_file('merged.o',   '.byte')
+parse_object_file('merged.o', '.byte')
 
 print '#elif defined(__i386__)'
 print 'BALIGN32'
author	Mike Klein <mtklein@chromium.org>	2017-08-28 17:53:34 -0400
committer	Skia Commit-Bot <skia-commit-bot@chromium.org>	2017-08-29 18:27:51 +0000
commit	b561b764d894260b77d3c44f8fa182802897f2e1 (patch)
tree	3d3e22e7f0c76bbb7775dba0e566aee28d3322a2 /src/jumper
parent	fe75930ce0b8d9451d29162942badfd568a1ec47 (diff)