Add Sk4h_load4 for loading F16.

Should feel very similar to Sk4h_store4: NEON uses its native instruction, SSE unpacks manually.

Since we'll have our F16s in 4 Sk4h by the time we're done here, this also extracts an Sk4h->Sk4f routine from the old uint64_t->Sk4f one.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2184753002
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review-Url: https://codereview.chromium.org/2184753002
author: mtklein <mtklein@chromium.org> 2016-07-26 08:01:19 -0700
committer: Commit bot <commit-bot@chromium.org> 2016-07-26 08:01:20 -0700
commit: f660b7cfcfbf3062f88e61f8320ea7051da72213 (patch)
tree: fca0db0effa07c70ea7f0b64f84b7f6e954c1944 /src/core/SkHalf.h
parent: 8c523e0f3ffa66eefd70f893e9f863b7d9ea3dc9 (diff)
1 files changed, 11 insertions, 8 deletions
diff --git a/src/core/SkHalf.h b/src/core/SkHalf.h
index adf8d3a126..bc9dd7940f 100644
--- a/src/core/SkHalf.h
+++ b/src/core/SkHalf.h
@@ -37,19 +37,18 @@ static inline Sk4h SkFloatToHalf_finite(const Sk4f&);
 
 // GCC 4.9 lacks the intrinsics to use ARMv8 f16<->f32 instructions, so we use inline assembly.
 
-static inline Sk4f SkHalfToFloat_finite(uint64_t hs) {
+static inline Sk4f SkHalfToFloat_finite(const Sk4h& hs) {
 #if !defined(SKNX_NO_SIMD) && defined(SK_CPU_ARM64)
     float32x4_t fs;
-    asm ("fmov  %d[fs], %[hs]        \n"   // vcreate_f16(hs)
-         "fcvtl %[fs].4s, %[fs].4h   \n"   // vcvt_f32_f16(...)
+    asm ("fcvtl %[fs].4s, %[hs].4h   \n"   // vcvt_f32_f16(...)
         : [fs] "=w" (fs)                   // =w: write-only NEON register
-        : [hs] "r" (hs));                  //  r: read-only 64-bit general register
+        : [hs] "w" (hs.fVec));             //  w: read-only NEON register
     return fs;
 #else
-    Sk4i bits      = SkNx_cast<int>(Sk4h::Load(&hs)),   // Expand to 32 bit.
-         sign      = bits & 0x00008000,                 // Save the sign bit for later...
-         positive  = bits ^ sign,                       // ...but strip it off for now.
-         is_denorm = positive < (1<<10);                // Exponent == 0?
+    Sk4i bits      = SkNx_cast<int>(hs),   // Expand to 32 bit.
+         sign      = bits & 0x00008000,    // Save the sign bit for later...
+         positive  = bits ^ sign,          // ...but strip it off for now.
+         is_denorm = positive < (1<<10);   // Exponent == 0?
 
     // For normal half floats, extend the mantissa by 13 zero bits,
     // then adjust the exponent from 15 bias to 127 bias.
@@ -66,6 +65,10 @@ static inline Sk4f SkHalfToFloat_finite(uint64_t hs) {
 #endif
 }
 
+static inline Sk4f SkHalfToFloat_finite(uint64_t hs) {
+    return SkHalfToFloat_finite(Sk4h::Load(&hs));
+}
+
 static inline Sk4h SkFloatToHalf_finite(const Sk4f& fs) {
 #if !defined(SKNX_NO_SIMD) && defined(SK_CPU_ARM64)
     float32x4_t vec = fs.fVec;
author	mtklein <mtklein@chromium.org>	2016-07-26 08:01:19 -0700
committer	Commit bot <commit-bot@chromium.org>	2016-07-26 08:01:20 -0700
commit	f660b7cfcfbf3062f88e61f8320ea7051da72213 (patch)
tree	fca0db0effa07c70ea7f0b64f84b7f6e954c1944 /src/core/SkHalf.h
parent	8c523e0f3ffa66eefd70f893e9f863b7d9ea3dc9 (diff)