8 files changed, 68 insertions, 41 deletions
diff --git a/src/core/SkHalf.h b/src/core/SkHalf.h
index 2f2ed66c6a..adf8d3a126 100644
--- a/src/core/SkHalf.h
+++ b/src/core/SkHalf.h
@@ -16,9 +16,10 @@
 // only used for storage
 typedef uint16_t SkHalf;
 
-#define SK_HalfMin      0x0400   // 2^-24  (minimum positive normal value)
-#define SK_HalfMax      0x7bff   // 65504
-#define SK_HalfEpsilon  0x1400   // 2^-10
+static constexpr uint16_t SK_HalfMin     = 0x0400; // 2^-14  (minimum positive normal value)
+static constexpr uint16_t SK_HalfMax     = 0x7bff; // 65504
+static constexpr uint16_t SK_HalfEpsilon = 0x1400; // 2^-10
+static constexpr uint16_t SK_Half1       = 0x3C00; // 1
 
 // convert between half and single precision floating point
 float SkHalfToFloat(SkHalf h);
@@ -26,8 +27,8 @@ SkHalf SkFloatToHalf(float f);
 
 // Convert between half and single precision floating point,
 // assuming inputs and outputs are both finite.
-static inline     Sk4f SkHalfToFloat_finite(uint64_t);
-static inline uint64_t SkFloatToHalf_finite(const Sk4f&);
+static inline Sk4f SkHalfToFloat_finite(uint64_t);
+static inline Sk4h SkFloatToHalf_finite(const Sk4f&);
 
 // ~~~~~~~~~~~ impl ~~~~~~~~~~~~~~ //
 
@@ -65,14 +66,12 @@ static inline Sk4f SkHalfToFloat_finite(uint64_t hs) {
 #endif
 }
 
-static inline uint64_t SkFloatToHalf_finite(const Sk4f& fs) {
-    uint64_t r;
+static inline Sk4h SkFloatToHalf_finite(const Sk4f& fs) {
 #if !defined(SKNX_NO_SIMD) && defined(SK_CPU_ARM64)
     float32x4_t vec = fs.fVec;
     asm ("fcvtn %[vec].4h, %[vec].4s  \n"   // vcvt_f16_f32(vec)
-         "fmov  %[r], %d[vec]         \n"   // vst1_f16(&r, ...)
-        : [r] "=r" (r)                      // =r: write-only 64-bit general register
-        , [vec] "+w" (vec));                // +w: read-write NEON register
+        : [vec] "+w" (vec));                // +w: read-write NEON register
+    return vreinterpret_u16_f32(vget_low_f32(vec));
 #else
     Sk4i bits           = Sk4i::Load(&fs),
          sign           = bits & 0x80000000,              // Save the sign bit for later...
@@ -91,9 +90,8 @@ static inline uint64_t SkFloatToHalf_finite(const Sk4f& fs) {
     Sk4i denorm = Sk4i::Load(&plus_K) ^ K;
 
     Sk4i merged = (sign >> 16) | will_be_denorm.thenElse(denorm, norm);
-    SkNx_cast<uint16_t>(merged).store(&r);
+    return SkNx_cast<uint16_t>(merged);
 #endif
-    return r;
 }
 
 #endif
diff --git a/src/core/SkMipMap.cpp b/src/core/SkMipMap.cpp
index 4811c9e073..cb9cc85a98 100644
--- a/src/core/SkMipMap.cpp
+++ b/src/core/SkMipMap.cpp
@@ -88,7 +88,9 @@ struct ColorTypeFilter_F16 {
         return SkHalfToFloat_finite(x);
     }
     static uint64_t Compact(const Sk4f& x) {
-        return SkFloatToHalf_finite(x);
+        uint64_t r;
+        SkFloatToHalf_finite(x).store(&r);
+        return r;
     }
 };
 
diff --git a/src/core/SkNx.h b/src/core/SkNx.h
index dec63f8d89..253fcf22fe 100644
--- a/src/core/SkNx.h
+++ b/src/core/SkNx.h
@@ -309,6 +309,15 @@ SI Sk4i Sk4f_round(const Sk4f& x) {
              (int) lrintf (x[3]), };
 }
 
+// Transpose 4 Sk4h (planar r, g, b, a) into 4 interleaved RGBA pixels and store (256 bits total).
+SI void Sk4h_store4(void* dst, const Sk4h& r, const Sk4h& g, const Sk4h& b, const Sk4h& a) {
+    uint64_t* dst64 = (uint64_t*) dst;
+    Sk4h(r[0], g[0], b[0], a[0]).store(dst64 + 0);
+    Sk4h(r[1], g[1], b[1], a[1]).store(dst64 + 1);
+    Sk4h(r[2], g[2], b[2], a[2]).store(dst64 + 2);
+    Sk4h(r[3], g[3], b[3], a[3]).store(dst64 + 3);
+}
+
 #endif
 
 SI void Sk4f_ToBytes(uint8_t p[16], const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) {
diff --git a/src/core/SkXfermodeF16.cpp b/src/core/SkXfermodeF16.cpp
index 63058f9dce..219e91188e 100644
--- a/src/core/SkXfermodeF16.cpp
+++ b/src/core/SkXfermodeF16.cpp
@@ -25,13 +25,13 @@ static void xfer_1(const SkXfermode* xfer, uint64_t dst[], const SkPM4f* src, in
             Sk4f d4 = SkHalfToFloat_finite(dst[i]);
             d4.store(d.fVec);
             Sk4f r4 = Sk4f::Load(proc(*src, d).fVec);
-            dst[i] = SkFloatToHalf_finite(lerp_by_coverage(r4, d4, aa[i]));
+            SkFloatToHalf_finite(lerp_by_coverage(r4, d4, aa[i])).store(&dst[i]);
         }
     } else {
         for (int i = 0; i < count; ++i) {
             SkHalfToFloat_finite(dst[i]).store(d.fVec);
             Sk4f r4 = Sk4f::Load(proc(*src, d).fVec);
-            dst[i] = SkFloatToHalf_finite(r4);
+            SkFloatToHalf_finite(r4).store(&dst[i]);
         }
     }
 }
@@ -45,13 +45,13 @@ static void xfer_n(const SkXfermode* xfer, uint64_t dst[], const SkPM4f src[], i
             Sk4f d4 = SkHalfToFloat_finite(dst[i]);
             d4.store(d.fVec);
             Sk4f r4 = Sk4f::Load(proc(src[i], d).fVec);
-            dst[i] = SkFloatToHalf_finite(lerp_by_coverage(r4, d4, aa[i]));
+            SkFloatToHalf_finite(lerp_by_coverage(r4, d4, aa[i])).store(&dst[i]);
         }
     } else {
         for (int i = 0; i < count; ++i) {
             SkHalfToFloat_finite(dst[i]).store(d.fVec);
             Sk4f r4 = Sk4f::Load(proc(src[i], d).fVec);
-            dst[i] = SkFloatToHalf_finite(r4);
+            SkFloatToHalf_finite(r4).store(&dst[i]);
         }
     }
 }
@@ -65,7 +65,7 @@ static void clear(const SkXfermode*, uint64_t dst[], const SkPM4f*, int count, c
         for (int i = 0; i < count; ++i) {
             if (aa[i]) {
                 const Sk4f d4 = SkHalfToFloat_finite(dst[i]);
-                dst[i] = SkFloatToHalf_finite(d4 * Sk4f((255 - aa[i]) * 1.0f/255));
+                SkFloatToHalf_finite(d4 * Sk4f((255 - aa[i]) * 1.0f/255)).store(&dst[i]);
             }
         }
     } else {
@@ -83,10 +83,12 @@ static void src_1(const SkXfermode*, uint64_t dst[], const SkPM4f* src, int coun
     if (aa) {
         for (int i = 0; i < count; ++i) {
             const Sk4f d4 = SkHalfToFloat_finite(dst[i]);
-            dst[i] = SkFloatToHalf_finite(lerp_by_coverage(s4, d4, aa[i]));
+            SkFloatToHalf_finite(lerp_by_coverage(s4, d4, aa[i])).store(&dst[i]);
         }
     } else {
-        sk_memset64(dst, SkFloatToHalf_finite(s4), count);
+        uint64_t s4h;
+        SkFloatToHalf_finite(s4).store(&s4h);
+        sk_memset64(dst, s4h, count);
     }
 }
 
@@ -96,12 +98,12 @@ static void src_n(const SkXfermode*, uint64_t dst[], const SkPM4f src[], int cou
         for (int i = 0; i < count; ++i) {
             const Sk4f s4 = Sk4f::Load(src[i].fVec);
             const Sk4f d4 = SkHalfToFloat_finite(dst[i]);
-            dst[i] = SkFloatToHalf_finite(lerp_by_coverage(s4, d4, aa[i]));
+            SkFloatToHalf_finite(lerp_by_coverage(s4, d4, aa[i])).store(&dst[i]);
         }
     } else {
         for (int i = 0; i < count; ++i) {
             const Sk4f s4 = Sk4f::Load(src[i].fVec);
-            dst[i] = SkFloatToHalf_finite(s4);
+            SkFloatToHalf_finite(s4).store(&dst[i]);
         }
     }
 }
@@ -124,9 +126,9 @@ static void srcover_1(const SkXfermode*, uint64_t dst[], const SkPM4f* src, int
         const Sk4f d4 = SkHalfToFloat_finite(dst[i]);
         const Sk4f r4 = s4 + d4 * dst_scale;
         if (aa) {
-            dst[i] = SkFloatToHalf_finite(lerp_by_coverage(r4, d4, aa[i]));
+            SkFloatToHalf_finite(lerp_by_coverage(r4, d4, aa[i])).store(&dst[i]);
         } else {
-            dst[i] = SkFloatToHalf_finite(r4);
+            SkFloatToHalf_finite(r4).store(&dst[i]);
         }
     }
 }
@@ -140,7 +142,7 @@ static void srcover_n(const SkXfermode*, uint64_t dst[], const SkPM4f src[], int
         if (aa) {
             r = lerp_by_coverage(r, d, aa[i]);
         }
-        dst[i] = SkFloatToHalf_finite(r);
+        SkFloatToHalf_finite(r).store(&dst[i]);
     }
 }
 
diff --git a/src/effects/gradients/Sk4fGradientPriv.h b/src/effects/gradients/Sk4fGradientPriv.h
index 68e95a63d9..9745119fd4 100644
--- a/src/effects/gradients/Sk4fGradientPriv.h
+++ b/src/effects/gradients/Sk4fGradientPriv.h
@@ -143,11 +143,13 @@ struct DstTraits<DstType::F16, premul> {
     }
 
     static void store(const Sk4f& c, Type* dst) {
-        *dst = SkFloatToHalf_finite(PM::apply(c));
+        SkFloatToHalf_finite(PM::apply(c)).store(dst);
     }
 
     static void store(const Sk4f& c, Type* dst, int n) {
-        sk_memset64(dst, SkFloatToHalf_finite(PM::apply(c)), n);
+        uint64_t color;
+        SkFloatToHalf_finite(PM::apply(c)).store(&color);
+        sk_memset64(dst, color, n);
     }
 
     static void store4x(const Sk4f& c0, const Sk4f& c1,
diff --git a/src/opts/SkColorXform_opts.h b/src/opts/SkColorXform_opts.h
index e7a2b45946..3bb11f5599 100644
--- a/src/opts/SkColorXform_opts.h
+++ b/src/opts/SkColorXform_opts.h
@@ -126,16 +126,10 @@ static void color_xform_RGB1(void* dst, const uint32_t* src, int len,
 
                 dst = SkTAddOffset<void>(dst, 4 * sizeof(uint32_t));
             } else {
-                // FIXME (msarett):
-                // Can we do better here?  Should we store half floats as planar?
-                // Should we write Intel/Arm specific code?  Should we add a transpose
-                // function to SkNx?  Should we rewrite the algorithm to be interleaved?
-                uint64_t* dst64 = (uint64_t*) dst;
-                dst64[0] = SkFloatToHalf_finite(Sk4f(dstReds[0], dstGreens[0], dstBlues[0], 1.0f));
-                dst64[1] = SkFloatToHalf_finite(Sk4f(dstReds[1], dstGreens[1], dstBlues[1], 1.0f));
-                dst64[2] = SkFloatToHalf_finite(Sk4f(dstReds[2], dstGreens[2], dstBlues[2], 1.0f));
-                dst64[3] = SkFloatToHalf_finite(Sk4f(dstReds[3], dstGreens[3], dstBlues[3], 1.0f));
-
+                Sk4h_store4(dst, SkFloatToHalf_finite(dstReds),
+                                 SkFloatToHalf_finite(dstGreens),
+                                 SkFloatToHalf_finite(dstBlues),
+                                 SK_Half1);
                 dst = SkTAddOffset<void>(dst, 4 * sizeof(uint64_t));
             }
         };
@@ -185,10 +179,9 @@ static void color_xform_RGB1(void* dst, const uint32_t* src, int len,
 
             dst = SkTAddOffset<void>(dst, sizeof(uint32_t));
         } else {
-            uint64_t rgba = SkFloatToHalf_finite(dstPixel);
-
-            // Set alpha to 1.0
-            rgba |= 0x3C00000000000000;
+            uint64_t rgba;
+            SkFloatToHalf_finite(dstPixel).store(&rgba);
+            rgba |= static_cast<uint64_t>(SK_Half1) << 48;
             *((uint64_t*) dst) = rgba;
             dst = SkTAddOffset<void>(dst, sizeof(uint64_t));
         }
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index 2f73e0368d..53e95eb97f 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -483,4 +483,15 @@ static inline Sk4i Sk4f_round(const Sk4f& x) {
     return vcvtq_s32_f32((x + 0.5f).fVec);
 }
 
+static inline void Sk4h_store4(void* dst, const Sk4h& r, const Sk4h& g, const Sk4h& b,
+                               const Sk4h& a) {
+    uint16x4x4_t rgba = {{
+        r.fVec,
+        g.fVec,
+        b.fVec,
+        a.fVec,
+    }};
+    vst4_u16((uint16_t*) dst, rgba);
+}
+
 #endif//SkNx_neon_DEFINED
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index 553ff1de3e..c0e48287b4 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -403,4 +403,14 @@ static inline Sk4i Sk4f_round(const Sk4f& x) {
     return _mm_cvtps_epi32(x.fVec);
 }
 
+static inline void Sk4h_store4(void* dst, const Sk4h& r, const Sk4h& g, const Sk4h& b,
+                               const Sk4h& a) {
+    __m128i rg = _mm_unpacklo_epi16(r.fVec, g.fVec);
+    __m128i ba = _mm_unpacklo_epi16(b.fVec, a.fVec);
+    __m128i lo = _mm_unpacklo_epi32(rg, ba);
+    __m128i hi = _mm_unpackhi_epi32(rg, ba);
+    _mm_storeu_si128(((__m128i*) dst) + 0, lo);
+    _mm_storeu_si128(((__m128i*) dst) + 1, hi);
+}
+
 #endif//SkNx_sse_DEFINED