author | Mike Klein <mtklein@chromium.org> | 2016-12-02 15:21:03 -0500
---|---|---
committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2016-12-05 15:22:37 +0000
commit | e2e2ae23751d5a81f9cc9db8df2f5064108248aa (patch) |
tree | 7b7906626977bb179d291579cdf0a2e7eec83aa2 |
parent | 55360b11c7da3b19e0c5ae1aa5a7a457cb5e373a (diff) |
Manual byte -> float conversion.
This is a follow-up to reviews.skia.org/5540, which did float -> byte.
We use the same trick here, exploiting the fact that 0x47000000 is the bit pattern of 32768.0f.
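As an aside (not part of this commit), here is a minimal scalar sketch of the trick in plain C++; `pun_float` is a hypothetical stand-in for the bit-pun the vectorized code performs with `SkNf::Load`. For any byte BB, the float with bit pattern 0x470000BB equals 32768.0f + BB/256.0f, so a single FMA rescales it to BB/255.0f:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Bit-cast uint32_t -> float (memcpy avoids strict-aliasing UB).
static float pun_float(uint32_t bits) {
    float f;
    std::memcpy(&f, &bits, sizeof f);
    return f;
}

int main() {
    float max_err = 0.0f;
    for (uint32_t b = 0; b < 256; b++) {
        // 0x47000000 is the bit pattern of 32768.0f; OR-ing a byte into the
        // low mantissa bits adds b * 2^-8, giving 32768.0f + b/256.0f.
        float f = pun_float(0x47000000 | b);
        // (f - 32768.0f) * (256/255.0f) == b/255.0f, folded into one FMA.
        float v = std::fma(f, 256/255.0f, -32768*256/255.0f);
        max_err = std::max(max_err, std::fabs(v - b/255.0f));
    }
    std::printf("max |error| vs b/255.0f: %g\n", max_err);  // about an ulp or less
    return 0;
}
```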
The benefit here is smaller than the other CL, but still measurable.
The exchange here is:

    before: int->float, multiply
    after:  OR, FMA
The cost of an FMA is the same as a multiply, so we're basically just replacing int->float conversion with a bitwise OR.
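To make the before/after concrete, here is a hedged sketch in raw SSE intrinsics (illustration only; the actual change goes through the SkNx wrappers and applies to NEON as well). It assumes the lanes of `x` already hold masked byte values 0..255 and that the target has FMA (e.g. compiled with -mfma):

```cpp
#include <immintrin.h>

// Before: int->float conversion, then a multiply by 1/255.
static __m128 from_byte_before(__m128i x) {
    return _mm_mul_ps(_mm_cvtepi32_ps(x), _mm_set1_ps(1/255.0f));
}

// After: bitwise OR splices each byte into 32768.0f's bit pattern,
// the bit-cast is free, and one FMA does the rescale. The cvt is gone.
static __m128 from_byte_after(__m128i x) {
    __m128 f = _mm_castsi128_ps(_mm_or_si128(x, _mm_set1_epi32(0x47000000)));
    return _mm_fmadd_ps(f, _mm_set1_ps(256/255.0f), _mm_set1_ps(-32768*256/255.0f));
}
```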
CQ_INCLUDE_TRYBOTS=skia.primary:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD
Change-Id: Ieac2247664afa3ff415aec2b48c21505905bee23
Reviewed-on: https://skia-review.googlesource.com/5542
Reviewed-by: Matt Sarett <msarett@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
-rw-r--r-- | src/opts/SkRasterPipeline_opts.h | 42 |
1 file changed, 25 insertions, 17 deletions
```diff
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 14f1255ec8..07f6323673 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -227,12 +227,27 @@ SI void store(size_t tail, const SkNx<N,T>& v, T* dst) {
 }
 #endif
 
+SI SkNf SkNf_fma(const SkNf& f, const SkNf& m, const SkNf& a) { return SkNx_fma(f,m,a); }
+
+SI SkNi SkNf_round(const SkNf& x, const SkNf& scale) {
+    // Every time I try, _mm_cvtps_epi32 benches as slower than using FMA and _mm_cvttps_epi32.  :/
+    return SkNx_cast<int>(SkNf_fma(x,scale, 0.5f));
+}
+
+SI SkNf SkNf_from_byte(const SkNi& x) {
+    // Same trick as in store_8888: 0x470000BB == 32768.0f + BB/256.0f for all bytes BB.
+    auto v = 0x47000000 | x;
+    // Read this as (pun_float(v) - 32768.0f) * (256/255.0f), redistributed to be an FMA.
+    return SkNf_fma(SkNf::Load(&v), 256/255.0f, -32768*256/255.0f);
+}
+SI SkNf SkNf_from_byte(const SkNu& x) { return SkNf_from_byte(SkNi::Load(&x)); }
+SI SkNf SkNf_from_byte(const SkNb& x) { return SkNf_from_byte(SkNx_cast<int>(x)); }
+
 SI void from_8888(const SkNu& _8888, SkNf* r, SkNf* g, SkNf* b, SkNf* a) {
-    auto to_float = [](const SkNu& v) { return SkNx_cast<float>(SkNi::Load(&v)); };
-    *r = (1/255.0f)*to_float((_8888 >>  0) & 0xff);
-    *g = (1/255.0f)*to_float((_8888 >>  8) & 0xff);
-    *b = (1/255.0f)*to_float((_8888 >> 16) & 0xff);
-    *a = (1/255.0f)*to_float( _8888 >> 24        );
+    *r = SkNf_from_byte((_8888      ) & 0xff);
+    *g = SkNf_from_byte((_8888 >>  8) & 0xff);
+    *b = SkNf_from_byte((_8888 >> 16) & 0xff);
+    *a = SkNf_from_byte((_8888 >> 24)       );
 }
 
 SI void from_4444(const SkNh& _4444, SkNf* r, SkNf* g, SkNf* b, SkNf* a) {
     auto _32_bit = SkNx_cast<int>(_4444);
@@ -250,13 +265,6 @@ SI void from_565(const SkNh& _565, SkNf* r, SkNf* g, SkNf* b) {
     *b = SkNx_cast<float>(_32_bit & SK_B16_MASK_IN_PLACE) * (1.0f / SK_B16_MASK_IN_PLACE);
 }
 
-SI SkNf SkNf_fma(const SkNf& f, const SkNf& m, const SkNf& a) { return SkNx_fma(f,m,a); }
-
-SI SkNi SkNf_round(const SkNf& x, const SkNf& scale) {
-    // Every time I try, _mm_cvtps_epi32 benches as slower than using FMA and _mm_cvttps_epi32.  :/
-    return SkNx_cast<int>(SkNf_fma(x,scale, 0.5f));
-}
-
 STAGE(trace) {
     SkDebugf("%s\n", (const char*)ctx);
 }
@@ -386,7 +394,7 @@ STAGE(scale_1_float) {
 STAGE(scale_u8) {
     auto ptr = *(const uint8_t**)ctx + x;
 
-    SkNf c = SkNx_cast<float>(load(tail, ptr)) * (1/255.0f);
+    SkNf c = SkNf_from_byte(load(tail, ptr));
     r = r*c;
     g = g*c;
     b = b*c;
@@ -411,7 +419,7 @@ STAGE(lerp_1_float) {
 STAGE(lerp_u8) {
     auto ptr = *(const uint8_t**)ctx + x;
 
-    SkNf c = SkNx_cast<float>(load(tail, ptr)) * (1/255.0f);
+    SkNf c = SkNf_from_byte(load(tail, ptr));
     r = lerp(dr, r, c);
     g = lerp(dg, g, c);
     b = lerp(db, b, c);
@@ -570,7 +578,7 @@ STAGE(load_tables) {
     r = gather(tail, loadCtx->fR, to_int((rgba >>  0) & 0xff));
     g = gather(tail, loadCtx->fG, to_int((rgba >>  8) & 0xff));
     b = gather(tail, loadCtx->fB, to_int((rgba >> 16) & 0xff));
-    a = (1/255.0f) * SkNx_cast<float>(to_int(rgba >> 24));
+    a = SkNf_from_byte(rgba >> 24);
 }
 
 STAGE(store_tables) {
@@ -863,7 +871,7 @@ STAGE(gather_a8) {
     SkNi offset = offset_and_ptr(&p, ctx, r, g);
 
     r = g = b = 0.0f;
-    a = SkNx_cast<float>(gather(tail, p, offset)) * (1/255.0f);
+    a = SkNf_from_byte(gather(tail, p, offset));
 }
 
 STAGE(gather_i8) {
     auto sc = (const SkImageShaderContext*)ctx;
@@ -877,7 +885,7 @@ STAGE(gather_g8) {
     const uint8_t* p;
     SkNi offset = offset_and_ptr(&p, ctx, r, g);
 
-    r = g = b = SkNx_cast<float>(gather(tail, p, offset)) * (1/255.0f);
+    r = g = b = SkNf_from_byte(gather(tail, p, offset));
     a = 1.0f;
 }
 
 STAGE(gather_565) {
```
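A side note on the relocated `SkNf_round`: its comment survives the move unchanged. Roughly what the two choices might lower to on SSE, as a hedged sketch (illustration only, not Skia's actual code generation):

```cpp
#include <immintrin.h>

// The obvious version: multiply, then convert with round-to-nearest
// (the MXCSR default). Per the comment above, this benches slower.
static __m128i round_via_cvtps(__m128 x, __m128 scale) {
    return _mm_cvtps_epi32(_mm_mul_ps(x, scale));
}

// What SkNf_round effectively does: add 0.5f as part of the FMA, then
// truncate. Correct for the non-negative values these stages feed it
// (ties at exactly .5 round up rather than to even).
static __m128i round_via_fma_trunc(__m128 x, __m128 scale) {
    return _mm_cvttps_epi32(_mm_fmadd_ps(x, scale, _mm_set1_ps(0.5f)));
}
```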