author    Mike Klein <mtklein@chromium.org>  2017-02-08 12:50:17 -0500
committer Skia Commit-Bot <skia-commit-bot@chromium.org>  2017-02-08 21:07:05 +0000
commit    16c149664d3f95e1cabded8a1b7b3d105222c236 (patch)
tree      87b89d4665f7936aad322d492c571f34ee83da72
parent    30ec0b3735d5f728c2aea4184736a3e286a5ccda (diff)
SkSplicer: begin on sse2/sse4.1 support
This lets us target older machines with SkSplicer. SSE2 and SSE4.1 are the
sweet spots if we're going to pick two more.

Nothing too interesting here except maybe the f16<->f32 code. I rearranged
a little to keep things consistent across platforms.

Next CL will get this into _generated.h and use it when appropriate.

Change-Id: Ibbdc61ea7a45d22b4f4058b01f75161ea74a7726
Reviewed-on: https://skia-review.googlesource.com/8193
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
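[Editor's note] For readers unfamiliar with the f16<->f32 trick below: a half and a float share the same sign/exponent/mantissa layout, just narrower, so a shift plus one multiply by a magic power of two converts between them. A minimal scalar sketch of the same idea, assuming positive, finite values and ignoring rounding, denormals, and inf/NaN; the names are illustrative, not from this patch:

    #include <cstdint>
    #include <cstring>

    static float    bits_to_float(uint32_t u) { float    f; std::memcpy(&f, &u, sizeof f); return f; }
    static uint32_t float_to_bits(float    f) { uint32_t u; std::memcpy(&u, &f, sizeof u); return u; }

    static float half_to_float(uint16_t h) {
        // Shift the half's exponent+mantissa into float position (line up the
        // mantissa), then multiply by 2^(127-15) to rebias the exponent.
        return bits_to_float((uint32_t)h << 13) * bits_to_float(0x77800000);
    }

    static uint16_t float_to_half(float f) {
        // Inverse: multiply by 2^(15-127) to rebias, then shift back down.
        return (uint16_t)(float_to_bits(f * bits_to_float(0x07800000)) >> 13);
    }

As a quick check, half_to_float(0x3C00) yields 1.0f, and float_to_half(1.0f) returns 0x3C00.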
-rw-r--r--  src/splicer/SkSplicer.cpp         1
-rw-r--r--  src/splicer/SkSplicer_shared.h    4
-rw-r--r--  src/splicer/SkSplicer_stages.cpp  100
-rwxr-xr-x  src/splicer/build_stages.py       11
4 files changed, 94 insertions, 22 deletions
diff --git a/src/splicer/SkSplicer.cpp b/src/splicer/SkSplicer.cpp
index 6a0fc3cdb7..cba7b2adf0 100644
--- a/src/splicer/SkSplicer.cpp
+++ b/src/splicer/SkSplicer.cpp
@@ -41,6 +41,7 @@ namespace {
1.0f, 255.0f, 1/255.0f, 0x000000ff,
0.0025f, 0.6975f, 0.3000f, 1/12.92f, 0.055f, // from_srgb
12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f, // to_srgb
+ 0x77800000, 0x07800000, // fp16 <-> fp32
};
// We do this a lot, so it's nice to infer the correct size. Works fine with arrays.
diff --git a/src/splicer/SkSplicer_shared.h b/src/splicer/SkSplicer_shared.h
index 6a8f14c54d..0ad0a09478 100644
--- a/src/splicer/SkSplicer_shared.h
+++ b/src/splicer/SkSplicer_shared.h
@@ -38,6 +38,10 @@ struct SkSplicer_constants {
float _0689206; // 0.689206f
float n_00988; // -0.0988f
float _00043; // 0.0043f
+
+ // fp16 <-> fp32
+ uint32_t _0x77800000;
+ uint32_t _0x07800000;
};
#endif//SkSplicer_shared_DEFINED
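[Editor's note] The two magic values decode as exact powers of two: 0x77800000 is the float bit pattern of 2^112 = 2^(127-15), and 0x07800000 is its reciprocal 2^-112 = 2^(15-127). A tiny sanity check, assuming IEEE-754 single precision (not part of this CL):

    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <cstring>

    int main() {
        uint32_t up_bits = 0x77800000, down_bits = 0x07800000;
        float up, down;
        std::memcpy(&up,   &up_bits,   sizeof up);
        std::memcpy(&down, &down_bits, sizeof down);
        assert(up   == std::ldexp(1.0f,  112));  // rebias half's 15 up to float's 127
        assert(down == std::ldexp(1.0f, -112));  // rebias float's 127 down to half's 15
    }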
diff --git a/src/splicer/SkSplicer_stages.cpp b/src/splicer/SkSplicer_stages.cpp
index 9c5a442069..bff58c2518 100644
--- a/src/splicer/SkSplicer_stages.cpp
+++ b/src/splicer/SkSplicer_stages.cpp
@@ -25,14 +25,15 @@ using K = const SkSplicer_constants;
using U8 = uint8_t __attribute__((ext_vector_type(4)));
// We polyfill a few routines that Clang doesn't build into ext_vector_types.
+ static F fma(F f, F m, F a) { return vfmaq_f32(a,f,m); }
static F min(F a, F b) { return vminq_f32(a,b); }
static F max(F a, F b) { return vmaxq_f32(a,b); }
- static F fma(F f, F m, F a) { return vfmaq_f32(a,f,m); }
static F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; }
static F rsqrt(F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
- static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
static U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
+ static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
+
static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
#elif defined(__ARM_NEON__)
#if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
@@ -46,19 +47,17 @@ using K = const SkSplicer_constants;
using U32 = uint32_t __attribute__((ext_vector_type(2)));
using U8 = uint8_t __attribute__((ext_vector_type(2)));
- static F min(F a, F b) { return vmin_f32(a,b); }
- static F max(F a, F b) { return vmax_f32(a,b); }
- static F fma(F f, F m, F a) { return vfma_f32(a,f,m); }
- static F rcp (F v) { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e ) * e; }
- static F rsqrt(F v) { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
- static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }
- static U32 round(F v, F scale) { return vcvt_u32_f32(fma(v,scale,0.5f)); }
+ static F fma(F f, F m, F a) { return vfma_f32(a,f,m); }
+ static F min(F a, F b) { return vmin_f32(a,b); }
+ static F max(F a, F b) { return vmax_f32(a,b); }
+ static F rcp (F v) { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e ) * e; }
+ static F rsqrt(F v) { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
+ static U32 round(F v, F scale) { return vcvt_u32_f32(fma(v,scale,0.5f)); }
+
+ static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }
static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
-#else
- #if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
- #error On x86, compile with -mavx2 -mfma -mf16c.
- #endif
+#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
#include <immintrin.h>
// These are __m256 and __m256i, but friendlier and strongly-typed.
@@ -67,15 +66,40 @@ using K = const SkSplicer_constants;
using U32 = uint32_t __attribute__((ext_vector_type(8)));
using U8 = uint8_t __attribute__((ext_vector_type(8)));
- static F min(F a, F b) { return _mm256_min_ps (a,b); }
- static F max(F a, F b) { return _mm256_max_ps (a,b); }
- static F fma(F f, F m, F a) { return _mm256_fmadd_ps(f,m,a);}
- static F rcp (F v) { return _mm256_rcp_ps (v); }
- static F rsqrt(F v) { return _mm256_rsqrt_ps (v); }
- static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
- static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
+ static F fma(F f, F m, F a) { return _mm256_fmadd_ps(f,m,a);}
+ static F min(F a, F b) { return _mm256_min_ps(a,b); }
+ static F max(F a, F b) { return _mm256_max_ps(a,b); }
+ static F rcp (F v) { return _mm256_rcp_ps (v); }
+ static F rsqrt(F v) { return _mm256_rsqrt_ps(v); }
+ static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
+
+ static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
static F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, ix, 4); }
+#elif defined(__SSE2__)
+ #include <immintrin.h>
+
+ using F = float __attribute__((ext_vector_type(4)));
+ using I32 = int32_t __attribute__((ext_vector_type(4)));
+ using U32 = uint32_t __attribute__((ext_vector_type(4)));
+ using U8 = uint8_t __attribute__((ext_vector_type(4)));
+
+ static F fma(F f, F m, F a) { return f*m+a; }
+ static F min(F a, F b) { return _mm_min_ps(a,b); }
+ static F max(F a, F b) { return _mm_max_ps(a,b); }
+ static F rcp (F v) { return _mm_rcp_ps (v); }
+ static F rsqrt(F v) { return _mm_rsqrt_ps(v); }
+ static U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }
+
+ static F if_then_else(I32 c, F t, F e) {
+ #if defined(__SSE4_1__)
+ return _mm_blendv_ps(e,t,c);
+ #else
+ return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e));
+ #endif
+ }
+
+ static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
#endif
static F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
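[Editor's note] The SSE2 fallback for if_then_else above is the classic mask-select idiom: comparisons set each lane to all-ones or all-zeros, so (c & t) | (~c & e) picks t where the condition is true and e elsewhere. A standalone sketch using Clang vector extensions, where same-size vector casts reinterpret bits (illustrative, not the patch's code):

    #include <cstdint>

    using F   = float   __attribute__((ext_vector_type(4)));
    using I32 = int32_t __attribute__((ext_vector_type(4)));

    static F if_then_else(I32 c, F t, F e) {
        // Each lane of c must be ~0 (true) or 0 (false).
        return (F)((c & (I32)t) | (~c & (I32)e));
    }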
@@ -310,7 +334,7 @@ STAGE(load_f16) {
g = {ga[0], ga[2]};
b = {rb[1], rb[3]};
a = {ga[1], ga[3]};
-#else
+#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
_23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
_45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
@@ -330,6 +354,25 @@ STAGE(load_f16) {
g = _mm256_cvtph_ps(_mm_unpackhi_epi64(rg0123, rg4567));
b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567));
a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567));
+#elif defined(__SSE2__)
+ auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
+ _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
+
+ auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
+ _13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3
+
+ auto rg = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
+ ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3
+
+ auto half_to_float = [&](U32 h) {
+ return (F)(h << 13) // Line up the mantissa,
+ * (F)U32(k->_0x77800000); // then fix up the exponent.
+ };
+
+ r = half_to_float(_mm_unpacklo_epi16(rg, _mm_setzero_si128()));
+ g = half_to_float(_mm_unpackhi_epi16(rg, _mm_setzero_si128()));
+ b = half_to_float(_mm_unpacklo_epi16(ba, _mm_setzero_si128()));
+ a = half_to_float(_mm_unpackhi_epi16(ba, _mm_setzero_si128()));
#endif
}
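[Editor's note] The unpack sequence in the SSE2 load above is effectively a 4x4 transpose: two rounds of 16-bit unpacks turn four interleaved r g b a pixels into planar r, g, b, a vectors, and the final unpacks against zero widen each 16-bit half into its own 32-bit lane. A scalar model of the same data movement, for illustration only:

    #include <cstdint>

    // Four f16 pixels in memory: r0 g0 b0 a0  r1 g1 b1 a1  r2 g2 b2 a2  r3 g3 b3 a3.
    static void deinterleave_f16(const uint16_t px[16],
                                 uint32_t r[4], uint32_t g[4],
                                 uint32_t b[4], uint32_t a[4]) {
        for (int i = 0; i < 4; i++) {
            r[i] = px[4*i + 0];   // widened to 32 bits, as unpack-with-zero does
            g[i] = px[4*i + 1];
            b[i] = px[4*i + 2];
            a[i] = px[4*i + 3];
        }
    }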
@@ -350,7 +393,7 @@ STAGE(store_f16) {
vcvt_f16_f32(float32x4_t{g[0], a[0], g[1], a[1]}),
}};
vst2_f16((float16_t*)ptr, rb_ga);
-#else
+#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION),
G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION),
B = _mm256_cvtps_ph(b, _MM_FROUND_CUR_DIRECTION),
@@ -365,6 +408,19 @@ STAGE(store_f16) {
_mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
_mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
_mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
+#elif defined(__SSE2__)
+ auto float_to_half = [&](F f) {
+ return (U32)(f * (F)U32(k->_0x07800000)) // Fix up the exponent,
+ >> 13; // then line up the mantissa.
+ };
+ U32 R = float_to_half(r),
+ G = float_to_half(g),
+ B = float_to_half(b),
+ A = float_to_half(a);
+ U32 rg = R | _mm_slli_si128(G,2),
+ ba = B | _mm_slli_si128(A,2);
+ _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
+ _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
#endif
}
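[Editor's note] The store path above leans on _mm_slli_si128 shifting the whole register by two bytes: with one 16-bit half in the low bits of each 32-bit lane, R | (G shifted up two bytes) packs an r,g pair into each lane, and the 32-bit unpacks then restore pixel order. A small demonstration of that packing step, runnable on any SSE2 machine (not from this CL):

    #include <emmintrin.h>   // SSE2
    #include <cstdint>
    #include <cstdio>

    int main() {
        __m128i R = _mm_setr_epi32(0x0001, 0x0002, 0x0003, 0x0004),
                G = _mm_setr_epi32(0x000a, 0x000b, 0x000c, 0x000d);
        __m128i rg = _mm_or_si128(R, _mm_slli_si128(G, 2));

        uint32_t out[4];
        _mm_storeu_si128((__m128i*)out, rg);
        for (int i = 0; i < 4; i++) {
            std::printf("%08x\n", out[i]);   // 000a0001 000b0002 000c0003 000d0004
        }
    }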
diff --git a/src/splicer/build_stages.py b/src/splicer/build_stages.py
index 900b47ff6d..a4fd97db4b 100755
--- a/src/splicer/build_stages.py
+++ b/src/splicer/build_stages.py
@@ -17,6 +17,17 @@ objdump = 'gobjdump'
cflags = '-std=c++11 -Os -fomit-frame-pointer'.split()
+sse2 = '-msse2 -mno-sse3 -mno-ssse3 -mno-sse4.1'.split()
+subprocess.check_call(['clang++'] + cflags + sse2 +
+ ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
+ ['-o', 'sse2.o'])
+
+sse41 = '-msse4.1'.split()
+subprocess.check_call(['clang++'] + cflags + sse41 +
+ ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
+ ['-o', 'sse41.o'])
+
+
hsw = '-mavx2 -mfma -mf16c'.split()
subprocess.check_call(['clang++'] + cflags + hsw +
['-c', 'src/splicer/SkSplicer_stages.cpp'] +