author    Mike Klein <mtklein@chromium.org>  2017-04-05 10:18:27 -0400
committer Skia Commit-Bot <skia-commit-bot@chromium.org>  2017-04-05 15:05:31 +0000
commit    fa6eb915042db25b23f3a37a17d9294cb0723356 (patch)
tree      6c156b9e5fa10376146a92842b13b0b2ef0d430d /src
parent    33aa2c7b5c351c17a11ce2bf7d7cd70a3d86cecc (diff)
finish up load4/store4 refactoring
I saved the easiest for last. No generated code diff for store_f32. This just moves the platform-specific code over to SkJumper_vectors.h. Also clarify types in the existing load4()/store4() functions. SkJumper_stages.cpp looks good to start growing again!

Change-Id: I6a8599d090b4e17663703b0c0325dbe550a6cdd8
Reviewed-on: https://skia-review.googlesource.com/11348
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
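The shape of the change, as a minimal standalone sketch rather than the actual Skia code: SI, U16, and F mirror the real names, but here they are scalar stand-ins, and tail handling plus the per-platform SIMD bodies in SkJumper_vectors.h are elided.

#include <cstddef>
#include <cstdint>

// Hypothetical scalar stand-ins for the per-platform vector types.
using U16 = uint16_t;
using F   = float;
#define SI static inline

// Before: one void* entry point; every platform body began with a cast,
// and only the uint16_t (f16) flavor of store4 existed.
SI void store4_old(void* vptr, size_t /*tail*/, U16 r, U16 g, U16 b, U16 a) {
    auto ptr = (uint16_t*)vptr;
    ptr[0] = r; ptr[1] = g; ptr[2] = b; ptr[3] = a;
}

// After: overloads keyed on the pointer type, so callers pass typed pointers
// and overload resolution picks the f16 or f32 path.
SI void store4(uint16_t* ptr, size_t /*tail*/, U16 r, U16 g, U16 b, U16 a) {
    ptr[0] = r; ptr[1] = g; ptr[2] = b; ptr[3] = a;
}
SI void store4(float* ptr, size_t /*tail*/, F r, F g, F b, F a) {
    ptr[0] = r; ptr[1] = g; ptr[2] = b; ptr[3] = a;
}

int main() {
    uint16_t px16[4];
    float    px32[4];
    store4(px16, 0, 1, 2, 3, 4);           // picks the uint16_t overload (f16 path)
    store4(px32, 0, 1.f, 2.f, 3.f, 4.f);   // picks the float overload (f32 path)
}

With the overloads in place, store_f32 in SkJumper_stages.cpp collapses to a single portable call, store4(ptr, tail, r,g,b,a), while store_f16 casts its uint64_t* to uint16_t* and calls the f16 overload, as the diff below shows.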
Diffstat (limited to 'src')
-rw-r--r--  src/jumper/SkJumper_stages.cpp   60
-rw-r--r--  src/jumper/SkJumper_vectors.h    78
2 files changed, 67 insertions, 71 deletions
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index fa64e805d6..acbec8b03b 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -624,71 +624,23 @@ STAGE(load_f16) {
auto ptr = *(const uint64_t**)ctx + x;
U16 R,G,B,A;
- load4(ptr,tail, &R,&G,&B,&A);
+ load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
r = from_half(R);
g = from_half(G);
b = from_half(B);
a = from_half(A);
}
-
STAGE(store_f16) {
auto ptr = *(uint64_t**)ctx + x;
-
- store4(ptr,tail, to_half(r)
- , to_half(g)
- , to_half(b)
- , to_half(a));
+ store4((uint16_t*)ptr,tail, to_half(r)
+ , to_half(g)
+ , to_half(b)
+ , to_half(a));
}
STAGE(store_f32) {
auto ptr = *(float**)ctx + 4*x;
-
-#if !defined(JUMPER)
- ptr[0] = r;
- ptr[1] = g;
- ptr[2] = b;
- ptr[3] = a;
-#elif defined(__aarch64__)
- vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
-#elif defined(__arm__)
- vst4_f32(ptr, (float32x2x4_t{{r,g,b,a}}));
-#elif defined(__AVX__)
- F rg0145 = _mm256_unpacklo_ps(r, g), // r0 g0 r1 g1 | r4 g4 r5 g5
- rg2367 = _mm256_unpackhi_ps(r, g), // r2 ... | r6 ...
- ba0145 = _mm256_unpacklo_ps(b, a), // b0 a0 b1 a1 | b4 a4 b5 a5
- ba2367 = _mm256_unpackhi_ps(b, a); // b2 ... | b6 ...
-
- F _04 = _mm256_unpacklo_pd(rg0145, ba0145), // r0 g0 b0 a0 | r4 g4 b4 a4
- _15 = _mm256_unpackhi_pd(rg0145, ba0145), // r1 ... | r5 ...
- _26 = _mm256_unpacklo_pd(rg2367, ba2367), // r2 ... | r6 ...
- _37 = _mm256_unpackhi_pd(rg2367, ba2367); // r3 ... | r7 ...
-
- if (__builtin_expect(tail, 0)) {
- if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); }
- if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); }
- if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); }
- if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); }
- if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); }
- if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); }
- if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); }
- } else {
- F _01 = _mm256_permute2f128_ps(_04, _15, 32), // 32 == 0010 0000 == lo, lo
- _23 = _mm256_permute2f128_ps(_26, _37, 32),
- _45 = _mm256_permute2f128_ps(_04, _15, 49), // 49 == 0011 0001 == hi, hi
- _67 = _mm256_permute2f128_ps(_26, _37, 49);
- _mm256_storeu_ps(ptr+ 0, _01);
- _mm256_storeu_ps(ptr+ 8, _23);
- _mm256_storeu_ps(ptr+16, _45);
- _mm256_storeu_ps(ptr+24, _67);
- }
-#elif defined(__SSE2__)
- auto v0 = r, v1 = g, v2 = b, v3 = a;
- _MM_TRANSPOSE4_PS(v0, v1, v2, v3);
- memcpy(ptr+ 0, &v0, sizeof(v0));
- memcpy(ptr+ 4, &v1, sizeof(v1));
- memcpy(ptr+ 8, &v2, sizeof(v2));
- memcpy(ptr+12, &v3, sizeof(v3));
-#endif
+ store4(ptr,tail, r,g,b,a);
}
SI F ulp_before(F v) {
diff --git a/src/jumper/SkJumper_vectors.h b/src/jumper/SkJumper_vectors.h
index 1685da9aa9..1b72ce7825 100644
--- a/src/jumper/SkJumper_vectors.h
+++ b/src/jumper/SkJumper_vectors.h
@@ -41,15 +41,19 @@
SI F gather(const float* p, U32 ix) { return p[ix]; }
- SI void load4(const void* vptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
- auto ptr = (const uint16_t*)vptr;
+ SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
*r = ptr[0];
*g = ptr[1];
*b = ptr[2];
*a = ptr[3];
}
- SI void store4(void* vptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
- auto ptr = (uint16_t*)vptr;
+ SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+ ptr[0] = r;
+ ptr[1] = g;
+ ptr[2] = b;
+ ptr[3] = a;
+ }
+ SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
ptr[0] = r;
ptr[1] = g;
ptr[2] = b;
@@ -92,16 +96,18 @@
SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
- SI void load4(const void* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
- uint16x4x4_t rgba = vld4_u16((const uint16_t*)ptr);
+ SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
+ uint16x4x4_t rgba = vld4_u16(ptr);
*r = rgba.val[0];
*g = rgba.val[1];
*b = rgba.val[2];
*a = rgba.val[3];
}
- SI void store4(void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
- uint16x4x4_t rgba = {{r,g,b,a}};
- vst4_u16((uint16_t*)ptr, rgba);
+ SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+ vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}}));
+ }
+ SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
+ vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
}
SI F from_half(U16 h) { return vcvt_f32_f16(h); }
@@ -139,8 +145,7 @@
SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
- SI void load4(const void* vptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
- auto ptr = (const uint16_t*)vptr;
+ SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
uint16x4x4_t rgba;
rgba = vld4_lane_u16(ptr + 0, rgba, 0);
rgba = vld4_lane_u16(ptr + 4, rgba, 1);
@@ -149,8 +154,7 @@
*b = unaligned_load<U16>(rgba.val+2);
*a = unaligned_load<U16>(rgba.val+3);
}
- SI void store4(void* vptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
- auto ptr = (uint16_t*)vptr;
+ SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
uint16x4x4_t rgba = {{
widen_cast<uint16x4_t>(r),
widen_cast<uint16x4_t>(g),
@@ -160,6 +164,9 @@
vst4_lane_u16(ptr + 0, rgba, 0);
vst4_lane_u16(ptr + 4, rgba, 1);
}
+ SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
+ vst4_f32(ptr, (float32x2x4_t{{r,g,b,a}}));
+ }
SI F from_half(U16 h) {
auto v = widen_cast<uint16x4_t>(h);
@@ -217,7 +224,7 @@
#endif
}
- SI void load4(const void* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
+ SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
__m128i _01, _23, _45, _67;
if (__builtin_expect(tail,0)) {
auto src = (const double*)ptr;
@@ -251,7 +258,7 @@
*b = _mm_unpacklo_epi64(ba0123, ba4567);
*a = _mm_unpackhi_epi64(ba0123, ba4567);
}
- SI void store4(void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+ SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
auto rg0123 = _mm_unpacklo_epi16(r, g), // r0 g0 r1 g1 r2 g2 r3 g3
rg4567 = _mm_unpackhi_epi16(r, g), // r4 g4 r5 g5 r6 g6 r7 g7
ba0123 = _mm_unpacklo_epi16(b, a),
@@ -278,6 +285,36 @@
_mm_storeu_si128((__m128i*)ptr + 3, _67);
}
}
+ SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
+ F rg0145 = _mm256_unpacklo_ps(r, g), // r0 g0 r1 g1 | r4 g4 r5 g5
+ rg2367 = _mm256_unpackhi_ps(r, g), // r2 ... | r6 ...
+ ba0145 = _mm256_unpacklo_ps(b, a), // b0 a0 b1 a1 | b4 a4 b5 a5
+ ba2367 = _mm256_unpackhi_ps(b, a); // b2 ... | b6 ...
+
+ F _04 = _mm256_unpacklo_pd(rg0145, ba0145), // r0 g0 b0 a0 | r4 g4 b4 a4
+ _15 = _mm256_unpackhi_pd(rg0145, ba0145), // r1 ... | r5 ...
+ _26 = _mm256_unpacklo_pd(rg2367, ba2367), // r2 ... | r6 ...
+ _37 = _mm256_unpackhi_pd(rg2367, ba2367); // r3 ... | r7 ...
+
+ if (__builtin_expect(tail, 0)) {
+ if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); }
+ if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); }
+ if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); }
+ if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); }
+ if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); }
+ if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); }
+ if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); }
+ } else {
+ F _01 = _mm256_permute2f128_ps(_04, _15, 32), // 32 == 0010 0000 == lo, lo
+ _23 = _mm256_permute2f128_ps(_26, _37, 32),
+ _45 = _mm256_permute2f128_ps(_04, _15, 49), // 49 == 0011 0001 == hi, hi
+ _67 = _mm256_permute2f128_ps(_26, _37, 49);
+ _mm256_storeu_ps(ptr+ 0, _01);
+ _mm256_storeu_ps(ptr+ 8, _23);
+ _mm256_storeu_ps(ptr+16, _45);
+ _mm256_storeu_ps(ptr+24, _67);
+ }
+ }
SI F from_half(U16 h) {
#if defined(__AVX2__)
@@ -350,7 +387,7 @@
SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
- SI void load4(const void* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
+ SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
_23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
@@ -365,12 +402,19 @@
*b = unaligned_load<U16>((uint16_t*)&ba + 0);
*a = unaligned_load<U16>((uint16_t*)&ba + 4);
}
- SI void store4(const void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+ SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)),
ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a));
_mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
_mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
}
+ SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
+ _MM_TRANSPOSE4_PS(r,g,b,a);
+ _mm_storeu_ps(ptr+ 0, r);
+ _mm_storeu_ps(ptr+ 4, g);
+ _mm_storeu_ps(ptr+ 8, b);
+ _mm_storeu_ps(ptr+12, a);
+ }
SI F from_half(U16 h) {
auto v = widen_cast<__m128i>(h);