Diffstat (limited to 'src/jumper/SkJumper_vectors.h')
-rw-r--r--  src/jumper/SkJumper_vectors.h  90
1 file changed, 81 insertions(+), 9 deletions(-)
diff --git a/src/jumper/SkJumper_vectors.h b/src/jumper/SkJumper_vectors.h
index 3e9edd8269..1685da9aa9 100644
--- a/src/jumper/SkJumper_vectors.h
+++ b/src/jumper/SkJumper_vectors.h
@@ -48,12 +48,23 @@
*b = ptr[2];
*a = ptr[3];
}
+ SI void store4(void* vptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+ auto ptr = (uint16_t*)vptr;
+ ptr[0] = r;
+ ptr[1] = g;
+ ptr[2] = b;
+ ptr[3] = a;
+ }
SI F from_half(U16 h) {
if ((int16_t)h < 0x0400) { h = 0; } // Flush denorm and negative to zero.
return bit_cast<F>(h << 13) // Line up the mantissa,
* bit_cast<F>(U32(0x77800000)); // then fix up the exponent.
}
+ SI U16 to_half(F f) {
+ return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000))) // Fix up the exponent,
+ >> 13; // then line up the mantissa.
+ }
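
The two magic constants here are reciprocal powers of two that rebias the exponent between the two formats: 0x07800000 is 2^-112 as float bits (a float with unbiased exponent e has field e+127; after the multiply it is e+15, exactly half's bias), and 0x77800000 is 2^112 for the return trip. A minimal standalone sketch of the same trick outside the JUMPER types (like the code above, it ignores rounding, NaN, and infinity):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static float    bits_to_float(uint32_t u) { float f; memcpy(&f, &u, 4); return f; }
    static uint32_t float_to_bits(float f)    { uint32_t u; memcpy(&u, &f, 4); return u; }

    static uint16_t to_half(float f) {
        // Multiplying by 2^-112 turns the float exponent bias (127) into the
        // half bias (15); the shift then drops the low 13 mantissa bits.
        return (uint16_t)(float_to_bits(f * bits_to_float(0x07800000)) >> 13);
    }

    static float from_half(uint16_t h) {
        if ((int16_t)h < 0x0400) { h = 0; }       // Flush denorm and negative to zero.
        return bits_to_float((uint32_t)h << 13)   // Line up the mantissa,
             * bits_to_float(0x77800000);         // then multiply by 2^112 to rebias.
    }

    int main() {
        const float tests[] = {1.0f, 0.5f, 3.25f, 65504.0f};
        for (float f : tests) {
            printf("%g -> 0x%04x -> %g\n", f, to_half(f), from_half(to_half(f)));
        }
    }
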
#elif defined(__aarch64__)
#include <arm_neon.h>
@@ -88,11 +99,14 @@
*b = rgba.val[2];
*a = rgba.val[3];
}
-
- SI F from_half(U16 h) {
- return vcvt_f32_f16(h);
+ SI void store4(void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+ uint16x4x4_t rgba = {{r,g,b,a}};
+ vst4_u16((uint16_t*)ptr, rgba);
}
+ SI F from_half(U16 h) { return vcvt_f32_f16(h); }
+ SI U16 to_half(F f) { return vcvt_f16_f32(f); }
+
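
vst4_u16 is NEON's interleaving structure store: it transposes the four 4-lane registers on the way out, so memory receives r0 g0 b0 a0 r1 g1 b1 a1 ... rather than four planar runs. (The ARMv7 path below produces the same layout one pixel at a time with vst4_lane_u16.) A portable model of what the single instruction writes:

    #include <cstdint>
    #include <cstdio>

    // Models vst4_u16((uint16_t*)ptr, {{r,g,b,a}}): lane i of each register
    // becomes one interleaved rgba pixel in memory.
    static void store4_model(uint16_t* ptr,
                             const uint16_t r[4], const uint16_t g[4],
                             const uint16_t b[4], const uint16_t a[4]) {
        for (int i = 0; i < 4; i++) {
            ptr[4*i + 0] = r[i];
            ptr[4*i + 1] = g[i];
            ptr[4*i + 2] = b[i];
            ptr[4*i + 3] = a[i];
        }
    }

    int main() {
        uint16_t r[4] = {10,11,12,13}, g[4] = {20,21,22,23},
                 b[4] = {30,31,32,33}, a[4] = {40,41,42,43};
        uint16_t px[16];
        store4_model(px, r, g, b, a);
        for (int i = 0; i < 16; i++) { printf("%d ", px[i]); }  // 10 20 30 40 11 21 31 41 ...
        printf("\n");
    }
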
#elif defined(__arm__)
#if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
#error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfpv4, without -mthumb.
@@ -135,12 +149,27 @@
*b = unaligned_load<U16>(rgba.val+2);
*a = unaligned_load<U16>(rgba.val+3);
}
+ SI void store4(void* vptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+ auto ptr = (uint16_t*)vptr;
+ uint16x4x4_t rgba = {{
+ widen_cast<uint16x4_t>(r),
+ widen_cast<uint16x4_t>(g),
+ widen_cast<uint16x4_t>(b),
+ widen_cast<uint16x4_t>(a),
+ }};
+ vst4_lane_u16(ptr + 0, rgba, 0);
+ vst4_lane_u16(ptr + 4, rgba, 1);
+ }
SI F from_half(U16 h) {
- uint16x4_t v;
- memcpy(&v, &h, sizeof(h));
+ auto v = widen_cast<uint16x4_t>(h);
return vget_low_f32(vcvt_f32_f16(v));
}
+ SI U16 to_half(F f) {
+ auto v = widen_cast<float32x4_t>(f);
+ uint16x4_t h = vcvt_f16_f32(v);
+ return unaligned_load<U16>(&h);
+ }
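
On ARMv7 this code runs two pixels at a time, so the two-lane U16/F values are first widened into the low halves of 4-lane registers; vst4_lane_u16 then writes lane i of all four registers as one 8-byte interleaved pixel, and to_half reads back only the two meaningful lanes with unaligned_load. An ARM-only sketch of that narrow conversion, assuming the flags from the #error above (-march=armv7-a -mfpu=neon-vfpv4); vreinterpret_u16_f16 keeps strict compilers happy where the code above assigns the float16x4_t result directly:

    #include <arm_neon.h>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
        // Two pixels' worth of floats sit in the low half of a 4-lane register.
        float32x4_t v = {1.0f, 0.5f, 0.0f, 0.0f};
        uint16x4_t  h = vreinterpret_u16_f16(vcvt_f16_f32(v));  // 4 x fp16 bits

        uint16_t out[2];
        memcpy(out, &h, sizeof(out));                // keep just the two useful lanes
        printf("0x%04x 0x%04x\n", out[0], out[1]);   // 0x3c00 0x3800
    }
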
#elif defined(__AVX__)
#include <immintrin.h>
@@ -222,6 +251,33 @@
*b = _mm_unpacklo_epi64(ba0123, ba4567);
*a = _mm_unpackhi_epi64(ba0123, ba4567);
}
+ SI void store4(void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+ auto rg0123 = _mm_unpacklo_epi16(r, g), // r0 g0 r1 g1 r2 g2 r3 g3
+ rg4567 = _mm_unpackhi_epi16(r, g), // r4 g4 r5 g5 r6 g6 r7 g7
+ ba0123 = _mm_unpacklo_epi16(b, a),
+ ba4567 = _mm_unpackhi_epi16(b, a);
+
+ auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
+ _23 = _mm_unpackhi_epi32(rg0123, ba0123),
+ _45 = _mm_unpacklo_epi32(rg4567, ba4567),
+ _67 = _mm_unpackhi_epi32(rg4567, ba4567);
+
+ if (__builtin_expect(tail,0)) {
+ auto dst = (double*)ptr;
+ if (tail > 0) { _mm_storel_pd(dst+0, _01); }
+ if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
+ if (tail > 2) { _mm_storel_pd(dst+2, _23); }
+ if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
+ if (tail > 4) { _mm_storel_pd(dst+4, _45); }
+ if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
+ if (tail > 6) { _mm_storel_pd(dst+6, _67); }
+ } else {
+ _mm_storeu_si128((__m128i*)ptr + 0, _01);
+ _mm_storeu_si128((__m128i*)ptr + 1, _23);
+ _mm_storeu_si128((__m128i*)ptr + 2, _45);
+ _mm_storeu_si128((__m128i*)ptr + 3, _67);
+ }
+ }
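
x86 has no interleaved store, so the transpose is built from unpacks instead: _mm_unpacklo/unpackhi_epi16 interleave r with g (and b with a) into 32-bit rg and ba units, and a second round of 32-bit unpacks pairs those into whole 64-bit pixels. The tail branch, hinted unlikely with __builtin_expect, then writes 1-7 pixels with 8-byte low/high double stores instead of full 16-byte stores. A runnable SSE2 sketch of one round of that interleave for four pixels (essentially the SSE2 store4 further down):

    #include <emmintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
        // Four pixels' channels in the low four lanes of each register.
        __m128i r = _mm_setr_epi16(10,11,12,13, 0,0,0,0),
                g = _mm_setr_epi16(20,21,22,23, 0,0,0,0),
                b = _mm_setr_epi16(30,31,32,33, 0,0,0,0),
                a = _mm_setr_epi16(40,41,42,43, 0,0,0,0);

        __m128i rg = _mm_unpacklo_epi16(r, g),   // r0 g0 r1 g1 r2 g2 r3 g3
                ba = _mm_unpacklo_epi16(b, a);   // b0 a0 b1 a1 b2 a2 b3 a3

        uint16_t px[16];
        _mm_storeu_si128((__m128i*)px + 0, _mm_unpacklo_epi32(rg, ba));  // pixels 0,1
        _mm_storeu_si128((__m128i*)px + 1, _mm_unpackhi_epi32(rg, ba));  // pixels 2,3

        for (int i = 0; i < 16; i++) { printf("%d ", px[i]); }  // 10 20 30 40 11 21 31 41 ...
        printf("\n");
    }
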
SI F from_half(U16 h) {
#if defined(__AVX2__)
@@ -237,6 +293,14 @@
* bit_cast<F>(U32(0x77800000_i)); // then fix up the exponent.
#endif
}
+ SI U16 to_half(F f) {
+ #if defined(__AVX2__)
+ return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
+ #else
+ return pack(bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i))) // Fix up the exponent,
+ >> 13); // then line up the mantissa.
+ #endif
+ }
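
On AVX2 machines the bit trick is unnecessary: the F16C instruction vcvtps2ph converts eight floats to eight halves in hardware (gated on __AVX2__ here; in practice every AVX2 part also has F16C), with _MM_FROUND_CUR_DIRECTION selecting the current rounding mode. A minimal usage sketch (compile with -mf16c):

    #include <immintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
        __m256  f = _mm256_setr_ps(1.0f, 0.5f, 2.0f, 3.25f, -1.0f, 0.0f, 65504.0f, 0.25f);
        __m128i h = _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);  // 8 x fp16

        uint16_t out[8];
        _mm_storeu_si128((__m128i*)out, h);
        for (int i = 0; i < 8; i++) { printf("0x%04x ", out[i]); }  // 0x3c00 0x3800 ...
        printf("\n");
    }
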
#elif defined(__SSE2__)
#include <immintrin.h>
@@ -266,8 +330,7 @@
return unaligned_load<U16>(&p); // We have two copies. Return (the lower) one.
}
SI U8 pack(U16 v) {
- __m128i r;
- memcpy(&r, &v, sizeof(v));
+ auto r = widen_cast<__m128i>(v);
r = _mm_packus_epi16(r,r);
return unaligned_load<U8>(&r);
}
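
widen_cast isn't defined in this header; from its uses in this diff it evidently moves a small value into the low bytes of a wider vector type, replacing the memcpy-into-a-local pattern it displaces above. A plausible sketch (an assumption; the real definition lives elsewhere in SkJumper, presumably SkJumper_misc.h):

    #include <cstring>

    // Hypothetical reconstruction: copy src into the low bytes of a wider Dst,
    // leaving the high bytes unspecified, just like the old memcpy code here.
    template <typename Dst, typename Src>
    static inline Dst widen_cast(const Src& src) {
        static_assert(sizeof(Dst) > sizeof(Src), "widen_cast() must widen");
        Dst dst;
        memcpy(&dst, &src, sizeof(Src));
        return dst;
    }
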
@@ -302,10 +365,15 @@
*b = unaligned_load<U16>((uint16_t*)&ba + 0);
*a = unaligned_load<U16>((uint16_t*)&ba + 4);
}
+ SI void store4(void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+ auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)),
+ ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a));
+ _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
+ _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
+ }
SI F from_half(U16 h) {
- __m128i v;
- memcpy(&v, &h, sizeof(h));
+ auto v = widen_cast<__m128i>(h);
// Same deal as AVX: flush denorms and negatives to zero.
v = _mm_andnot_si128(_mm_cmplt_epi16(v, _mm_set1_epi32(0x04000400_i)), v);
@@ -314,6 +382,10 @@
return bit_cast<F>(w << 13) // Line up the mantissa,
* bit_cast<F>(U32(0x77800000_i)); // then fix up the exponent.
}
+ SI U16 to_half(F f) {
+ return pack(bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i))) // Fix up the exponent,
+ >> 13); // then line up the mantissa.
+ }
#endif
// We need to be careful with casts.