path: root/src/splicer/SkSplicer_stages.cpp
author    Mike Klein <mtklein@chromium.org>  2017-01-12 11:36:46 -0500
committer Skia Commit-Bot <skia-commit-bot@chromium.org>  2017-01-13 17:25:15 +0000
commit    4ef8cb3527b7e3f453dccd39eea76e31eb2c33c7
tree      bbb69e2f6aa3113192508451a8f99f96eefc8e07  /src/splicer/SkSplicer_stages.cpp
parent    70b49fd063171a78d3c664ca8af3988f5426319b
some armv7 hacking
We can splice these stages if we drop them down to 2 at a time. Turns out this is significantly (2-3x) faster than the status quo:

    SkRasterPipeline_…
      …f16_compile    1x
      …srgb_compile   2.06x
      …f16_run        3.08x
      …srgb_run       4.61x

Added a couple of ways to detect (likely) the required VFPv4 support:
  - use hwcap when available (NDK ≥21, Android framework);
  - use cpu-features when not (NDK <21).

The code in SkSplicer_generated.h is ARM, not Thumb2. SkSplicer appears to blx into it, which is what we want, and we bx lr back out. There's no point in attempting to use Thumb2 in vector-heavy code; it will all be 4-byte instructions anyway.

Follow-ups:
  - vpush {d8-d9} before the loop and vpop {d8-d9} afterwards, skipping those instructions when splicing;
  - (probably) drop jumping stages down to 2-at-a-time as well.

Change-Id: If151394ec10e8cbd6a05e2d81808488d743bfe15
Reviewed-on: https://skia-review.googlesource.com/6940
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
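The VFPv4 detection itself is not part of the diff shown below; it lands in Skia's CPU-feature plumbing elsewhere. A minimal sketch of the two paths described above, assuming getauxval(AT_HWCAP) on newer NDKs and the NDK cpu-features library otherwise; the function name, API-level threshold, and feature bits used here are illustrative, not the commit's code:

    // Minimal sketch only; the real detection lives outside this file.
    #include <cstdint>

    #if defined(__ANDROID__) && defined(__ARM_NEON__)
        #if __ANDROID_API__ >= 21
            #include <sys/auxv.h>
            static bool likely_has_vfpv4() {
                const uint32_t kHWCAP_VFPv4 = (1 << 16);   // value from <asm/hwcap.h>, spelled out here
                return (getauxval(AT_HWCAP) & kHWCAP_VFPv4) != 0;
            }
        #else
            #include <cpu-features.h>   // the NDK's cpu_features helper library
            static bool likely_has_vfpv4() {
                // VFPv4 is essentially VFP/NEON with FMA, so use the FMA feature bit as a proxy.
                return (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON_FMA) != 0;
            }
        #endif
    #endif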
Diffstat (limited to 'src/splicer/SkSplicer_stages.cpp')
-rw-r--r--  src/splicer/SkSplicer_stages.cpp  |  54
1 file changed, 47 insertions(+), 7 deletions(-)
diff --git a/src/splicer/SkSplicer_stages.cpp b/src/splicer/SkSplicer_stages.cpp
index e3a19ea5a8..c45f204e22 100644
--- a/src/splicer/SkSplicer_stages.cpp
+++ b/src/splicer/SkSplicer_stages.cpp
@@ -25,13 +25,34 @@
using U8 = uint8_t __attribute__((ext_vector_type(4)));
// We polyfill a few routines that Clang doesn't build into ext_vector_types.
- AI static U32 round(F v) { return vcvtnq_u32_f32(v); }
AI static F min(F a, F b) { return vminq_f32(a,b); }
AI static F max(F a, F b) { return vmaxq_f32(a,b); }
AI static F fma(F f, F m, F a) { return vfmaq_f32(a,f,m); }
AI static F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; }
AI static F rsqrt(F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
AI static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
+ AI static U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
+
+#elif defined(__ARM_NEON__)
+ #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
+ #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfpv4, without -mthumb.
+ #endif
+ #include <arm_neon.h>
+
+ // We can pass {s0-s15} as arguments under AAPCS-VFP. We'll slice that as 8 d-registers.
+ using F = float __attribute__((ext_vector_type(2)));
+ using I32 = int32_t __attribute__((ext_vector_type(2)));
+ using U32 = uint32_t __attribute__((ext_vector_type(2)));
+ using U8 = uint8_t __attribute__((ext_vector_type(2)));
+
+ AI static F min(F a, F b) { return vmin_f32(a,b); }
+ AI static F max(F a, F b) { return vmax_f32(a,b); }
+ AI static F fma(F f, F m, F a) { return vfma_f32(a,f,m); }
+ AI static F rcp (F v) { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e ) * e; }
+ AI static F rsqrt(F v) { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
+ AI static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }
+ AI static U32 round(F v, F scale) { return vcvt_u32_f32(fma(v,scale,0.5f)); }
+
#else
#if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
#error On x86, compile with -mavx2 -mfma -mf16c.
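A note on the round() signature change in this hunk and the AVX2 one below: the AArch64 and AVX2 conversions (vcvtnq_u32_f32, _mm256_cvtps_epi32) round to nearest, but ARMv7's vcvt_u32_f32 truncates toward zero, which is why the 2-wide path folds the scale into round() and adds 0.5 before converting. A tiny scalar illustration, using exactly representable values:

    #include <cstdint>

    int main() {
        float v = 0.75f, scale = 2.0f;                     // product is exactly 1.5f
        uint32_t truncated = (uint32_t)(v*scale);          // 1: plain vcvt_u32_f32-style truncation
        uint32_t rounded   = (uint32_t)(v*scale + 0.5f);   // 2: matches the round-to-nearest paths
        // Ties behave slightly differently from round-to-nearest-even, which is fine for 8-bit color.
        return (truncated == 1 && rounded == 2) ? 0 : 1;
    }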
@@ -44,13 +65,13 @@
using U32 = uint32_t __attribute__((ext_vector_type(8)));
using U8 = uint8_t __attribute__((ext_vector_type(8)));
- AI static U32 round(F v) { return _mm256_cvtps_epi32(v); }
AI static F min(F a, F b) { return _mm256_min_ps (a,b); }
AI static F max(F a, F b) { return _mm256_max_ps (a,b); }
AI static F fma(F f, F m, F a) { return _mm256_fmadd_ps(f,m,a);}
AI static F rcp (F v) { return _mm256_rcp_ps (v); }
AI static F rsqrt(F v) { return _mm256_rsqrt_ps (v); }
AI static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
+ AI static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
#endif
AI static F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
@@ -58,7 +79,12 @@ AI static U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
// We'll be compiling this file to an object file, then extracting parts of it into
// SkSplicer_generated.h. It's easier to do if the function names are not C++ mangled.
-#define C extern "C"
+// On ARMv7, use aapcs-vfp calling convention to pass as much data in registers as possible.
+#if defined(__ARM_NEON__)
+ #define C extern "C" __attribute__((pcs("aapcs-vfp")))
+#else
+ #define C extern "C"
+#endif
// Stages all fit a common interface that allows SkSplicer to splice them together.
using K = const SkSplicer_constants;
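For context on why pcs("aapcs-vfp") is worth the #ifdef: under the default AAPCS, floating-point and vector arguments travel in core registers or on the stack, while the VFP variant passes them in s0-s15, viewed here as d0-d7. With the 2-lane F above, that is exactly eight vector arguments in registers. A hypothetical signature to illustrate the idea; the real stage interface is defined by the STAGE machinery elsewhere in this file and is not shown in this diff:

    // Hypothetical illustration only -- not the actual SkSplicer stage signature.
    using F2 = float __attribute__((ext_vector_type(2)));

    extern "C" __attribute__((pcs("aapcs-vfp")))
    void example_stage(F2 r, F2 g, F2 b, F2 a, F2 dr, F2 dg, F2 db, F2 da) {
        // Eight 2-lane float arguments land in d0-d7 under aapcs-vfp,
        // so color and destination-color lanes all arrive in registers.
        (void)r; (void)g; (void)b; (void)a; (void)dr; (void)dg; (void)db; (void)da;
    }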
@@ -240,10 +266,10 @@ STAGE(load_8888) {
STAGE(store_8888) {
auto ptr = *(uint32_t**)ctx + x;
- U32 px = round(r * k->_255)
- | round(g * k->_255) << 8
- | round(b * k->_255) << 16
- | round(a * k->_255) << 24;
+ U32 px = round(r, k->_255)
+ | round(g, k->_255) << 8
+ | round(b, k->_255) << 16
+ | round(a, k->_255) << 24;
memcpy(ptr, &px, sizeof(px));
}
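A worked example of the packing above for one lane, assuming r=1.0, g=0.5, b=0.25, a=1.0 (values chosen to be exactly representable):

    #include <cstdint>
    #include <cstring>

    int main() {
        // round(1.0, 255) = 255 (0xFF), round(0.5, 255) = 128 (0x80), round(0.25, 255) = 64 (0x40).
        uint32_t px = 0xFFu | (0x80u << 8) | (0x40u << 16) | (0xFFu << 24);   // 0xFF4080FF
        uint8_t bytes[4];
        std::memcpy(bytes, &px, sizeof(px));
        // On a little-endian target the bytes land in memory as { R, G, B, A } = { 0xFF, 0x80, 0x40, 0xFF }.
        return (bytes[0] == 0xFF && bytes[3] == 0xFF) ? 0 : 1;
    }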
@@ -256,6 +282,14 @@ STAGE(load_f16) {
g = vcvt_f32_f16(halfs.val[1]);
b = vcvt_f32_f16(halfs.val[2]);
a = vcvt_f32_f16(halfs.val[3]);
+#elif defined(__ARM_NEON__)
+ auto rb_ga = vld2_f16((const float16_t*)ptr);
+ auto rb = vcvt_f32_f16(rb_ga.val[0]),
+ ga = vcvt_f32_f16(rb_ga.val[1]);
+ r = {rb[0], rb[2]};
+ g = {ga[0], ga[2]};
+ b = {rb[1], rb[3]};
+ a = {ga[1], ga[3]};
#else
auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
_23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
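The lane shuffle in the new ARMv7 branch may be easier to follow written out as scalars; a sketch (plain floats stand in for the f16 halfs) of what vld2_f16 plus the lane picks compute for two pixels:

    // Memory holds two RGBA f16 pixels: h[0..7] = r0 g0 b0 a0 r1 g1 b1 a1.
    void load_f16_2px_scalar(const float h[8], float r[2], float g[2], float b[2], float a[2]) {
        // vld2 deinterleaves by pairs: rb = {r0,b0,r1,b1}, ga = {g0,a0,g1,a1}.
        float rb[4] = { h[0], h[2], h[4], h[6] };
        float ga[4] = { h[1], h[3], h[5], h[7] };
        r[0] = rb[0];  r[1] = rb[2];
        b[0] = rb[1];  b[1] = rb[3];
        g[0] = ga[0];  g[1] = ga[2];
        a[0] = ga[1];  a[1] = ga[3];
    }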
@@ -290,6 +324,12 @@ STAGE(store_f16) {
vcvt_f16_f32(a),
}};
vst4_f16((float16_t*)ptr, halfs);
+#elif defined(__ARM_NEON__)
+ float16x4x2_t rb_ga = {{
+ vcvt_f16_f32(float32x4_t{r[0], b[0], r[1], b[1]}),
+ vcvt_f16_f32(float32x4_t{g[0], a[0], g[1], a[1]}),
+ }};
+ vst2_f16((float16_t*)ptr, rb_ga);
#else
auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION),
G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION),
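The store side is the mirror image; a matching scalar sketch of the re-interleave that vst2_f16 performs above:

    // Writes two pixels back as r0 g0 b0 a0 r1 g1 b1 a1 (floats stand in for halfs).
    void store_f16_2px_scalar(float h[8], const float r[2], const float g[2],
                              const float b[2], const float a[2]) {
        float rb[4] = { r[0], b[0], r[1], b[1] };
        float ga[4] = { g[0], a[0], g[1], a[1] };
        for (int i = 0; i < 4; i++) {
            h[2*i + 0] = rb[i];   // even elements: r0, b0, r1, b1
            h[2*i + 1] = ga[i];   // odd  elements: g0, a0, g1, a1
        }
    }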