author     mtklein <mtklein@chromium.org>        2016-07-26 08:01:19 -0700
committer  Commit bot <commit-bot@chromium.org>  2016-07-26 08:01:20 -0700
commit     f660b7cfcfbf3062f88e61f8320ea7051da72213 (patch)
tree       fca0db0effa07c70ea7f0b64f84b7f6e954c1944
parent     8c523e0f3ffa66eefd70f893e9f863b7d9ea3dc9 (diff)
Add Sk4h_load4 for loading F16.
Should feel very similar to Sk4h_store4: NEON uses its native instruction, SSE unpacks manually.

Since we'll have our F16s in 4 Sk4h by the time we're done here, this also extracts an Sk4h->Sk4f routine from the old uint64_t->Sk4f one.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2184753002
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review-Url: https://codereview.chromium.org/2184753002
-rw-r--r--  src/core/SkHalf.h                    | 19
-rw-r--r--  src/core/SkNx.h                      | 13
-rw-r--r--  src/core/SkRasterPipelineBlitter.cpp | 16
-rw-r--r--  src/opts/SkNx_neon.h                 |  8
-rw-r--r--  src/opts/SkNx_sse.h                  | 13

5 files changed, 52 insertions(+), 17 deletions(-)
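For reference, the contract of the new Sk4h_load4 can be written as a scalar sketch (a hypothetical helper in plain C++ with no SkNx types, not code from this change): read four packed RGBA F16 pixels (256 bits) and transpose them into per-channel arrays.

    #include <cstdint>
    #include <cstring>

    // Scalar sketch of what Sk4h_load4 promises (hypothetical helper).
    static void sk4h_load4_reference(const void* vptr,
                                     uint16_t r[4], uint16_t g[4],
                                     uint16_t b[4], uint16_t a[4]) {
        uint16_t px[4][4];                  // 4 pixels x 4 channels of halfs
        std::memcpy(px, vptr, sizeof(px));  // 256 bits total
        for (int i = 0; i < 4; i++) {
            r[i] = px[i][0];                // gather each channel across pixels
            g[i] = px[i][1];
            b[i] = px[i][2];
            a[i] = px[i][3];
        }
    }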
diff --git a/src/core/SkHalf.h b/src/core/SkHalf.h
index adf8d3a126..bc9dd7940f 100644
--- a/src/core/SkHalf.h
+++ b/src/core/SkHalf.h
@@ -37,19 +37,18 @@ static inline Sk4h SkFloatToHalf_finite(const Sk4f&);
// GCC 4.9 lacks the intrinsics to use ARMv8 f16<->f32 instructions, so we use inline assembly.
-static inline Sk4f SkHalfToFloat_finite(uint64_t hs) {
+static inline Sk4f SkHalfToFloat_finite(const Sk4h& hs) {
#if !defined(SKNX_NO_SIMD) && defined(SK_CPU_ARM64)
float32x4_t fs;
- asm ("fmov %d[fs], %[hs] \n" // vcreate_f16(hs)
- "fcvtl %[fs].4s, %[fs].4h \n" // vcvt_f32_f16(...)
+ asm ("fcvtl %[fs].4s, %[hs].4h \n" // vcvt_f32_f16(...)
: [fs] "=w" (fs) // =w: write-only NEON register
- : [hs] "r" (hs)); // r: read-only 64-bit general register
+ : [hs] "w" (hs.fVec)); // w: read-only NEON register
return fs;
#else
- Sk4i bits = SkNx_cast<int>(Sk4h::Load(&hs)), // Expand to 32 bit.
- sign = bits & 0x00008000, // Save the sign bit for later...
- positive = bits ^ sign, // ...but strip it off for now.
- is_denorm = positive < (1<<10); // Exponent == 0?
+ Sk4i bits = SkNx_cast<int>(hs), // Expand to 32 bit.
+ sign = bits & 0x00008000, // Save the sign bit for later...
+ positive = bits ^ sign, // ...but strip it off for now.
+ is_denorm = positive < (1<<10); // Exponent == 0?
// For normal half floats, extend the mantissa by 13 zero bits,
// then adjust the exponent from 15 bias to 127 bias.
@@ -66,6 +65,10 @@ static inline Sk4f SkHalfToFloat_finite(uint64_t hs) {
#endif
}
+static inline Sk4f SkHalfToFloat_finite(uint64_t hs) {
+ return SkHalfToFloat_finite(Sk4h::Load(&hs));
+}
+
static inline Sk4h SkFloatToHalf_finite(const Sk4f& fs) {
#if !defined(SKNX_NO_SIMD) && defined(SK_CPU_ARM64)
float32x4_t vec = fs.fVec;
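The non-SIMD path above relies on the standard half->float bit layout: extend the 10-bit mantissa by 13 zero bits and rebias the exponent from 15 to 127. A minimal scalar sketch of just the normal-value path (hypothetical helper, finite normal inputs only; the denorm handling stays as in the code above):

    #include <cstdint>
    #include <cstring>

    static float half_to_float_normal(uint16_t h) {
        uint32_t sign = (uint32_t)(h & 0x8000) << 16;      // sign moves to bit 31
        uint32_t em   = (uint32_t)(h & 0x7fff) << 13;      // mantissa gains 13 zero bits
        uint32_t bits = sign | (em + ((127 - 15) << 23));  // rebias exponent 15 -> 127
        float f;
        std::memcpy(&f, &bits, sizeof f);                  // e.g. 0x3C00 -> 1.0f
        return f;
    }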
diff --git a/src/core/SkNx.h b/src/core/SkNx.h
index 253fcf22fe..308addd928 100644
--- a/src/core/SkNx.h
+++ b/src/core/SkNx.h
@@ -309,6 +309,19 @@ SI Sk4i Sk4f_round(const Sk4f& x) {
(int) lrintf (x[3]), };
}
+// Load 4 Sk4h and transpose them (256 bits total).
+SI void Sk4h_load4(const void* vptr, Sk4h* r, Sk4h* g, Sk4h* b, Sk4h* a) {
+ const uint64_t* ptr = (const uint64_t*)vptr;
+ auto p0 = Sk4h::Load(ptr+0),
+ p1 = Sk4h::Load(ptr+1),
+ p2 = Sk4h::Load(ptr+2),
+ p3 = Sk4h::Load(ptr+3);
+ *r = { p0[0], p1[0], p2[0], p3[0] };
+ *g = { p0[1], p1[1], p2[1], p3[1] };
+ *b = { p0[2], p1[2], p2[2], p3[2] };
+ *a = { p0[3], p1[3], p2[3], p3[3] };
+}
+
// Transpose 4 Sk4h and store (256 bits total).
SI void Sk4h_store4(void* dst, const Sk4h& r, const Sk4h& g, const Sk4h& b, const Sk4h& a) {
uint64_t* dst64 = (uint64_t*) dst;
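A quick usage sketch, assuming src/core/SkNx.h is included and px is a hypothetical buffer: the new Sk4h_load4 is the inverse of the existing Sk4h_store4, so a store/load round trip recovers the channel vectors lane for lane.

    uint64_t px[4];                      // 4 packed RGBA F16 pixels, 256 bits
    Sk4h_store4(px, r, g, b, a);         // interleave channels and store
    Sk4h rr, gg, bb, aa;
    Sk4h_load4(px, &rr, &gg, &bb, &aa);  // load and de-interleave
    // rr,gg,bb,aa now match r,g,b,a lane for lane.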
diff --git a/src/core/SkRasterPipelineBlitter.cpp b/src/core/SkRasterPipelineBlitter.cpp
index 6518a88fa9..a27abbdc38 100644
--- a/src/core/SkRasterPipelineBlitter.cpp
+++ b/src/core/SkRasterPipelineBlitter.cpp
@@ -231,15 +231,13 @@ static void SK_VECTORCALL load_d_f16(SkRasterPipeline::Stage* st, size_t x,
Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
auto ptr = st->ctx<const uint64_t*>() + x;
- // TODO: This can be made a lot more efficient with platform-specific code.
- auto p0 = SkHalfToFloat_finite(ptr[0]),
- p1 = SkHalfToFloat_finite(ptr[1]),
- p2 = SkHalfToFloat_finite(ptr[2]),
- p3 = SkHalfToFloat_finite(ptr[3]);
- dr = { p0[0], p1[0], p2[0], p3[0] };
- dg = { p0[1], p1[1], p2[1], p3[1] };
- db = { p0[2], p1[2], p2[2], p3[2] };
- da = { p0[3], p1[3], p2[3], p3[3] };
+ Sk4h rh, gh, bh, ah;
+ Sk4h_load4(ptr, &rh, &gh, &bh, &ah);
+
+ dr = SkHalfToFloat_finite(rh);
+ dg = SkHalfToFloat_finite(gh);
+ db = SkHalfToFloat_finite(bh);
+ da = SkHalfToFloat_finite(ah);
st->next(x, r,g,b,a, dr,dg,db,da);
}
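A hedged trace of the stage's new data flow, four pixels per trip (lane labels are illustrative, not code from this change):

    // ptr -> R0 G0 B0 A0 | R1 G1 B1 A1 | R2 G2 B2 A2 | R3 G3 B3 A3   (halfs)
    // Sk4h_load4           : rh = {R0,R1,R2,R3}, gh = {G0,G1,G2,G3}, ...
    // SkHalfToFloat_finite : one vector half->float convert per channel,
    //                        replacing four per-pixel converts plus a
    //                        scalar transpose.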
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index 53e95eb97f..df11de3d5f 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -483,6 +483,14 @@ static inline Sk4i Sk4f_round(const Sk4f& x) {
return vcvtq_s32_f32((x + 0.5f).fVec);
}
+static inline void Sk4h_load4(const void* ptr, Sk4h* r, Sk4h* g, Sk4h* b, Sk4h* a) {
+ uint16x4x4_t rgba = vld4_u16((const uint16_t*)ptr);
+ *r = rgba.val[0];
+ *g = rgba.val[1];
+ *b = rgba.val[2];
+ *a = rgba.val[3];
+}
+
static inline void Sk4h_store4(void* dst, const Sk4h& r, const Sk4h& g, const Sk4h& b,
const Sk4h& a) {
uint16x4x4_t rgba = {{
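On NEON the whole transpose is one instruction: vld4_u16 loads 16 uint16_t values and de-interleaves them with stride 4. A small standalone sketch of its semantics (hypothetical demo function, not part of this change):

    #include <arm_neon.h>
    #include <cstdint>

    // vld4_u16 reads p[0..15] and splits them by stride 4, so for packed
    // RGBA halfs val[0..3] come back as the R,G,B,A lanes directly.
    static void demo_vld4(const uint16_t p[16], uint16_t r[4]) {
        uint16x4x4_t rgba = vld4_u16(p);  // val[0] = {p[0],p[4],p[8],p[12]}
        vst1_u16(r, rgba.val[0]);         // store just the R channel
    }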
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index c0e48287b4..3881b54e3a 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -403,6 +403,19 @@ static inline Sk4i Sk4f_round(const Sk4f& x) {
return _mm_cvtps_epi32(x.fVec);
}
+static inline void Sk4h_load4(const void* ptr, Sk4h* r, Sk4h* g, Sk4h* b, Sk4h* a) {
+ __m128i lo = _mm_loadu_si128(((__m128i*)ptr) + 0),
+ hi = _mm_loadu_si128(((__m128i*)ptr) + 1);
+ __m128i even = _mm_unpacklo_epi16(lo, hi), // r0 r2 g0 g2 b0 b2 a0 a2
+ odd = _mm_unpackhi_epi16(lo, hi); // r1 r3 ...
+ __m128i rg = _mm_unpacklo_epi16(even, odd), // r0 r1 r2 r3 g0 g1 g2 g3
+ ba = _mm_unpackhi_epi16(even, odd); // b0 b1 ... a0 a1 ...
+ *r = rg;
+ *g = _mm_srli_si128(rg, 8);
+ *b = ba;
+ *a = _mm_srli_si128(ba, 8);
+}
+
static inline void Sk4h_store4(void* dst, const Sk4h& r, const Sk4h& g, const Sk4h& b,
const Sk4h& a) {
__m128i rg = _mm_unpacklo_epi16(r.fVec, g.fVec);
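The SSE path is a 4x4 transpose of 16-bit lanes done in two rounds of unpacks. A standalone check of the lane order (hypothetical test function, SSE2 only), filling four pixels with 0..15 so the R channel lands at indices 0,4,8,12:

    #include <emmintrin.h>
    #include <cstdint>

    static bool check_unpack_transpose() {
        uint16_t px[16];
        for (int i = 0; i < 16; i++) { px[i] = (uint16_t)i; }  // R0=0, G0=1, ...
        __m128i lo   = _mm_loadu_si128((const __m128i*)px + 0),
                hi   = _mm_loadu_si128((const __m128i*)px + 1);
        __m128i even = _mm_unpacklo_epi16(lo, hi),     // R0 R2 G0 G2 B0 B2 A0 A2
                odd  = _mm_unpackhi_epi16(lo, hi);     // R1 R3 G1 G3 B1 B3 A1 A3
        __m128i rg   = _mm_unpacklo_epi16(even, odd);  // R0 R1 R2 R3 G0 G1 G2 G3
        uint16_t out[8];
        _mm_storeu_si128((__m128i*)out, rg);
        return out[0] == 0 && out[1] == 4 && out[2] == 8 && out[3] == 12;  // the R lanes
    }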