author     2017-04-05 10:18:27 -0400
committer  2017-04-05 15:05:31 +0000
commit     fa6eb915042db25b23f3a37a17d9294cb0723356 (patch)
tree       6c156b9e5fa10376146a92842b13b0b2ef0d430d /src
parent     33aa2c7b5c351c17a11ce2bf7d7cd70a3d86cecc (diff)
finish up load4/store4 refactoring
I saved the easiest for last. No generated code diff for store_f32.
This just moves the platform-specific code over to SkJumper_vectors.h.
Also clarify types in the existing load4()/store4() functions.
SkJumper_stages.cpp looks good to start growing again!
Change-Id: I6a8599d090b4e17663703b0c0325dbe550a6cdd8
Reviewed-on: https://skia-review.googlesource.com/11348
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
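The core pattern in this change is overload resolution on the pointer's element type: instead of load4()/store4() taking void* and casting internally, each platform provides overloads keyed on uint16_t* or float*, so the compiler picks the right variant and rejects mismatched calls. Below is a minimal standalone sketch of that pattern, with scalar stand-ins for SkJumper's vector types; the bodies and names are illustrative, not Skia's actual code.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Scalar stand-ins for SkJumper's per-lane vector types.
using U16 = uint16_t;
using F   = float;

// store4() for 16-bit channels (e.g. half floats carried as uint16_t).
static void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
    (void)tail;  // tail handling elided in this sketch
    ptr[0] = r; ptr[1] = g; ptr[2] = b; ptr[3] = a;
}

// store4() for 32-bit float channels -- a distinct overload, not a cast.
static void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
    (void)tail;
    ptr[0] = r; ptr[1] = g; ptr[2] = b; ptr[3] = a;
}

int main() {
    uint16_t h[4];
    float    f[4];
    store4(h, 0, U16{1}, U16{2}, U16{3}, U16{4});  // picks the uint16_t overload
    store4(f, 0, 0.1f, 0.2f, 0.3f, 0.4f);          // picks the float overload
    printf("%d %g\n", h[0], (double)f[3]);
    return 0;
}

With this in place a caller such as store_f32 shrinks to a single store4(ptr, tail, r,g,b,a) call, which is exactly what the diff below does.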
Diffstat (limited to 'src')
-rw-r--r--  src/jumper/SkJumper_stages.cpp  60
-rw-r--r--  src/jumper/SkJumper_vectors.h   78
2 files changed, 67 insertions, 71 deletions
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index fa64e805d6..acbec8b03b 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -624,71 +624,23 @@
 STAGE(load_f16) {
     auto ptr = *(const uint64_t**)ctx + x;
 
     U16 R,G,B,A;
-    load4(ptr,tail, &R,&G,&B,&A);
+    load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
 
     r = from_half(R);
     g = from_half(G);
     b = from_half(B);
     a = from_half(A);
 }
-
 STAGE(store_f16) {
     auto ptr = *(uint64_t**)ctx + x;
-
-    store4(ptr,tail, to_half(r)
-                   , to_half(g)
-                   , to_half(b)
-                   , to_half(a));
+    store4((uint16_t*)ptr,tail, to_half(r)
+                              , to_half(g)
+                              , to_half(b)
+                              , to_half(a));
 }
 
 STAGE(store_f32) {
     auto ptr = *(float**)ctx + 4*x;
-
-#if !defined(JUMPER)
-    ptr[0] = r;
-    ptr[1] = g;
-    ptr[2] = b;
-    ptr[3] = a;
-#elif defined(__aarch64__)
-    vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
-#elif defined(__arm__)
-    vst4_f32(ptr, (float32x2x4_t{{r,g,b,a}}));
-#elif defined(__AVX__)
-    F rg0145 = _mm256_unpacklo_ps(r, g),  // r0 g0 r1 g1 | r4 g4 r5 g5
-      rg2367 = _mm256_unpackhi_ps(r, g),  // r2 ...      | r6 ...
-      ba0145 = _mm256_unpacklo_ps(b, a),  // b0 a0 b1 a1 | b4 a4 b5 a5
-      ba2367 = _mm256_unpackhi_ps(b, a);  // b2 ...      | b6 ...
-
-    F _04 = _mm256_unpacklo_pd(rg0145, ba0145),  // r0 g0 b0 a0 | r4 g4 b4 a4
-      _15 = _mm256_unpackhi_pd(rg0145, ba0145),  // r1 ...      | r5 ...
-      _26 = _mm256_unpacklo_pd(rg2367, ba2367),  // r2 ...      | r6 ...
-      _37 = _mm256_unpackhi_pd(rg2367, ba2367);  // r3 ...      | r7 ...
-
-    if (__builtin_expect(tail, 0)) {
-        if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); }
-        if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); }
-        if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); }
-        if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); }
-        if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); }
-        if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); }
-        if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); }
-    } else {
-        F _01 = _mm256_permute2f128_ps(_04, _15, 32),  // 32 == 0010 0000 == lo, lo
-          _23 = _mm256_permute2f128_ps(_26, _37, 32),
-          _45 = _mm256_permute2f128_ps(_04, _15, 49),  // 49 == 0011 0001 == hi, hi
-          _67 = _mm256_permute2f128_ps(_26, _37, 49);
-        _mm256_storeu_ps(ptr+ 0, _01);
-        _mm256_storeu_ps(ptr+ 8, _23);
-        _mm256_storeu_ps(ptr+16, _45);
-        _mm256_storeu_ps(ptr+24, _67);
-    }
-#elif defined(__SSE2__)
-    auto v0 = r, v1 = g, v2 = b, v3 = a;
-    _MM_TRANSPOSE4_PS(v0, v1, v2, v3);
-    memcpy(ptr+ 0, &v0, sizeof(v0));
-    memcpy(ptr+ 4, &v1, sizeof(v1));
-    memcpy(ptr+ 8, &v2, sizeof(v2));
-    memcpy(ptr+12, &v3, sizeof(v3));
-#endif
+    store4(ptr,tail, r,g,b,a);
 }
 
 SI F ulp_before(F v) {
diff --git a/src/jumper/SkJumper_vectors.h b/src/jumper/SkJumper_vectors.h
index 1685da9aa9..1b72ce7825 100644
--- a/src/jumper/SkJumper_vectors.h
+++ b/src/jumper/SkJumper_vectors.h
@@ -41,15 +41,19 @@
     SI F gather(const float* p, U32 ix) { return p[ix]; }
 
-    SI void load4(const void* vptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
-        auto ptr = (const uint16_t*)vptr;
+    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
         *r = ptr[0];
         *g = ptr[1];
         *b = ptr[2];
         *a = ptr[3];
     }
-    SI void store4(void* vptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
-        auto ptr = (uint16_t*)vptr;
+    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+        ptr[0] = r;
+        ptr[1] = g;
+        ptr[2] = b;
+        ptr[3] = a;
+    }
+    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
         ptr[0] = r;
         ptr[1] = g;
         ptr[2] = b;
@@ -92,16 +96,18 @@
     SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
 
-    SI void load4(const void* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
-        uint16x4x4_t rgba = vld4_u16((const uint16_t*)ptr);
+    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
+        uint16x4x4_t rgba = vld4_u16(ptr);
         *r = rgba.val[0];
         *g = rgba.val[1];
         *b = rgba.val[2];
         *a = rgba.val[3];
     }
-    SI void store4(void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
-        uint16x4x4_t rgba = {{r,g,b,a}};
-        vst4_u16((uint16_t*)ptr, rgba);
+    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+        vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}}));
+    }
+    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
+        vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
     }
 
     SI F from_half(U16 h) { return vcvt_f32_f16(h); }
@@ -139,8 +145,7 @@
     SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
 
-    SI void load4(const void* vptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
-        auto ptr = (const uint16_t*)vptr;
+    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
         uint16x4x4_t rgba;
         rgba = vld4_lane_u16(ptr + 0, rgba, 0);
         rgba = vld4_lane_u16(ptr + 4, rgba, 1);
@@ -149,8 +154,7 @@
         *b = unaligned_load<U16>(rgba.val+2);
         *a = unaligned_load<U16>(rgba.val+3);
     }
-    SI void store4(void* vptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
-        auto ptr = (uint16_t*)vptr;
+    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
         uint16x4x4_t rgba = {{
             widen_cast<uint16x4_t>(r),
             widen_cast<uint16x4_t>(g),
@@ -160,6 +164,9 @@
         vst4_lane_u16(ptr + 0, rgba, 0);
         vst4_lane_u16(ptr + 4, rgba, 1);
     }
+    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
+        vst4_f32(ptr, (float32x2x4_t{{r,g,b,a}}));
+    }
 
     SI F from_half(U16 h) {
         auto v = widen_cast<uint16x4_t>(h);
@@ -217,7 +224,7 @@
 #endif
     }
 
-    SI void load4(const void* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
+    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
         __m128i _01, _23, _45, _67;
         if (__builtin_expect(tail,0)) {
             auto src = (const double*)ptr;
@@ -251,7 +258,7 @@
         *b = _mm_unpacklo_epi64(ba0123, ba4567);
         *a = _mm_unpackhi_epi64(ba0123, ba4567);
     }
-    SI void store4(void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
         auto rg0123 = _mm_unpacklo_epi16(r, g),  // r0 g0 r1 g1 r2 g2 r3 g3
              rg4567 = _mm_unpackhi_epi16(r, g),  // r4 g4 r5 g5 r6 g6 r7 g7
              ba0123 = _mm_unpacklo_epi16(b, a),
@@ -278,6 +285,36 @@
             _mm_storeu_si128((__m128i*)ptr + 3, _67);
         }
     }
+    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
+        F rg0145 = _mm256_unpacklo_ps(r, g),  // r0 g0 r1 g1 | r4 g4 r5 g5
+          rg2367 = _mm256_unpackhi_ps(r, g),  // r2 ...      | r6 ...
+          ba0145 = _mm256_unpacklo_ps(b, a),  // b0 a0 b1 a1 | b4 a4 b5 a5
+          ba2367 = _mm256_unpackhi_ps(b, a);  // b2 ...      | b6 ...
+
+        F _04 = _mm256_unpacklo_pd(rg0145, ba0145),  // r0 g0 b0 a0 | r4 g4 b4 a4
+          _15 = _mm256_unpackhi_pd(rg0145, ba0145),  // r1 ...      | r5 ...
+          _26 = _mm256_unpacklo_pd(rg2367, ba2367),  // r2 ...      | r6 ...
+          _37 = _mm256_unpackhi_pd(rg2367, ba2367);  // r3 ...      | r7 ...
+
+        if (__builtin_expect(tail, 0)) {
+            if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); }
+            if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); }
+            if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); }
+            if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); }
+            if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); }
+            if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); }
+            if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); }
+        } else {
+            F _01 = _mm256_permute2f128_ps(_04, _15, 32),  // 32 == 0010 0000 == lo, lo
+              _23 = _mm256_permute2f128_ps(_26, _37, 32),
+              _45 = _mm256_permute2f128_ps(_04, _15, 49),  // 49 == 0011 0001 == hi, hi
+              _67 = _mm256_permute2f128_ps(_26, _37, 49);
+            _mm256_storeu_ps(ptr+ 0, _01);
+            _mm256_storeu_ps(ptr+ 8, _23);
+            _mm256_storeu_ps(ptr+16, _45);
+            _mm256_storeu_ps(ptr+24, _67);
+        }
+    }
 
     SI F from_half(U16 h) {
 #if defined(__AVX2__)
@@ -350,7 +387,7 @@
     SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
 
-    SI void load4(const void* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
+    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
         auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
              _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
@@ -365,12 +402,19 @@
         *b = unaligned_load<U16>((uint16_t*)&ba + 0);
         *a = unaligned_load<U16>((uint16_t*)&ba + 4);
     }
-    SI void store4(const void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
         auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)),
              ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a));
         _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
         _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
     }
+    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
+        _MM_TRANSPOSE4_PS(r,g,b,a);
+        _mm_storeu_ps(ptr+ 0, r);
+        _mm_storeu_ps(ptr+ 4, g);
+        _mm_storeu_ps(ptr+ 8, b);
+        _mm_storeu_ps(ptr+12, a);
+    }
 
     SI F from_half(U16 h) {
         auto v = widen_cast<__m128i>(h);
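Every platform-specific store4(float*, ...) above implements the same contract: N pixels held as planar r, g, b, a vectors are interleaved into memory as r0 g0 b0 a0 r1 g1 b1 a1, and so on. A scalar reference sketch of that contract follows; it is an illustration for checking the SIMD paths against, not part of the patch, and it assumes the tail convention visible in the AVX path (tail == 0 means a full batch of N pixels, otherwise only the first tail pixels are written).

#include <cstddef>
#include <cstdio>

// Scalar reference for store4(float*, ...): interleave N planar channel
// arrays into memory as r0 g0 b0 a0 r1 g1 b1 a1 ...
// N is the pipeline width: 8 lanes on the AVX path, 4 on SSE2/NEON.
template <size_t N>
void store4_reference(float* ptr, size_t tail,
                      const float (&r)[N], const float (&g)[N],
                      const float (&b)[N], const float (&a)[N]) {
    size_t n = tail ? tail : N;  // tail == 0 means a full batch of N pixels
    for (size_t i = 0; i < n; i++) {
        ptr[4*i + 0] = r[i];
        ptr[4*i + 1] = g[i];
        ptr[4*i + 2] = b[i];
        ptr[4*i + 3] = a[i];
    }
}

int main() {
    // 4-wide example, matching the SSE2/NEON lane count.
    float r[4] = {0,1,2,3}, g[4] = {4,5,6,7},
          b[4] = {8,9,10,11}, a[4] = {12,13,14,15};
    float dst[16];
    store4_reference(dst, 0, r, g, b, a);
    printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]);  // 0 4 8 12
    return 0;
}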