From adc78d52024fc546df61aff091a0d522a8fb95f2 Mon Sep 17 00:00:00 2001
From: Mike Klein
Date: Mon, 1 Jan 2018 09:06:37 -0500
Subject: fold SkJumper_vectors.h into SkJumper_stages.cpp

This brings a little more symmetry to _stages.cpp and _stages_lowp.cpp.

Change-Id: Icfcbd3f264ab97d8445ad8e14c25b4a07c780aea
Reviewed-on: https://skia-review.googlesource.com/90030
Reviewed-by: Mike Klein
Commit-Queue: Mike Klein
---
 src/jumper/SkJumper_misc.h     |   2 +-
 src/jumper/SkJumper_stages.cpp | 694 +++++++++++++++++++++++++++++++++++++++-
 src/jumper/SkJumper_vectors.h  | 700 -----------------------------------------
 3 files changed, 686 insertions(+), 710 deletions(-)
 delete mode 100644 src/jumper/SkJumper_vectors.h

diff --git a/src/jumper/SkJumper_misc.h b/src/jumper/SkJumper_misc.h
index 4aa88aafb0..4f3512417f 100644
--- a/src/jumper/SkJumper_misc.h
+++ b/src/jumper/SkJumper_misc.h
@@ -10,7 +10,7 @@
 
 #include <string.h>   // for memcpy()
 
-// Miscellany used by SkJumper_stages.cpp and SkJumper_vectors.h.
+// Miscellany used by SkJumper_stages.cpp and SkJumper_stages_lowp.cpp.
 // Every function in this file should be marked static and inline using SI.
 
 #if defined(__clang__)
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index c7b165a8d6..bfc2cd0fec 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -6,15 +6,7 @@
  */
 
 #include "SkJumper.h"
-#include "SkJumper_misc.h"     // SI, unaligned_load(), bit_cast()
-#include "SkJumper_vectors.h"  // F, I32, U32, U16, U8, cast(), expand()
-
-// Our fundamental vector depth is our pixel stride.
-static const size_t N = sizeof(F) / sizeof(float);
-
-// A reminder:
-// When defined(JUMPER_IS_SCALAR), F, I32, etc. are normal scalar types and N is 1.
-// When not, F, I32, etc. are N-deep Clang ext_vector_type vectors of the appropriate type.
+#include "SkJumper_misc.h"
 
 // A little wrapper macro to name Stages differently depending on the instruction set.
 // That lets us link together several options.
@@ -32,6 +24,690 @@ static const size_t N = sizeof(F) / sizeof(float);
     #define WRAP(name) sk_##name##_sse2
 #endif
 
+// Every function in this file should be marked static and inline using SI (see SkJumper_misc.h).
+
+#if !defined(__clang__)
+    #define JUMPER_IS_SCALAR
+#elif defined(__ARM_NEON)
+    #define JUMPER_IS_NEON
+#elif defined(__AVX512F__)
+    #define JUMPER_IS_AVX512
+#elif defined(__AVX2__) && defined(__F16C__) && defined(__FMA__)
+    #define JUMPER_IS_HSW
+#elif defined(__AVX__)
+    #define JUMPER_IS_AVX
+#elif defined(__SSE4_1__)
+    #define JUMPER_IS_SSE41
+#elif defined(__SSE2__)
+    #define JUMPER_IS_SSE2
+#else
+    #define JUMPER_IS_SCALAR
+#endif
+
+// Older Clangs seem to crash when generating non-optimized NEON code for ARMv7.
+#if defined(__clang__) && !defined(__OPTIMIZE__) && defined(__arm__)
+    // Apple Clang 9 and vanilla Clang 5 are fine, and may even be conservative.
+    #if defined(__apple_build_version__) && __clang_major__ < 9
+        #define JUMPER_IS_SCALAR
+    #elif __clang_major__ < 5
+        #define JUMPER_IS_SCALAR
+    #endif
+#endif
+
+#if defined(JUMPER_IS_SCALAR)
+    // This path should lead to portable scalar code.
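    // (In this configuration F, I32, U32, etc. are plain scalar types rather than
    //  Clang ext_vector_type vectors, so the pipeline works one pixel at a time.)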
+ #include + + using F = float ; + using I32 = int32_t; + using U64 = uint64_t; + using U32 = uint32_t; + using U16 = uint16_t; + using U8 = uint8_t ; + + SI F mad(F f, F m, F a) { return f*m+a; } + SI F min(F a, F b) { return fminf(a,b); } + SI F max(F a, F b) { return fmaxf(a,b); } + SI F abs_ (F v) { return fabsf(v); } + SI F floor_(F v) { return floorf(v); } + SI F rcp (F v) { return 1.0f / v; } + SI F rsqrt (F v) { return 1.0f / sqrtf(v); } + SI F sqrt_(F v) { return sqrtf(v); } + SI U32 round (F v, F scale) { return (uint32_t)(v*scale + 0.5f); } + SI U16 pack(U32 v) { return (U16)v; } + SI U8 pack(U16 v) { return (U8)v; } + + SI F if_then_else(I32 c, F t, F e) { return c ? t : e; } + + template + SI T gather(const T* p, U32 ix) { return p[ix]; } + + SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { + *r = ptr[0]; + *g = ptr[1]; + *b = ptr[2]; + } + SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { + *r = ptr[0]; + *g = ptr[1]; + *b = ptr[2]; + *a = ptr[3]; + } + SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { + ptr[0] = r; + ptr[1] = g; + ptr[2] = b; + ptr[3] = a; + } + + SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { + *r = ptr[0]; + *g = ptr[1]; + *b = ptr[2]; + *a = ptr[3]; + } + SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { + ptr[0] = r; + ptr[1] = g; + ptr[2] = b; + ptr[3] = a; + } + +#elif defined(JUMPER_IS_NEON) + #include + + // Since we know we're using Clang, we can use its vector extensions. + template using V = T __attribute__((ext_vector_type(4))); + using F = V; + using I32 = V< int32_t>; + using U64 = V; + using U32 = V; + using U16 = V; + using U8 = V; + + // We polyfill a few routines that Clang doesn't build into ext_vector_types. + SI F min(F a, F b) { return vminq_f32(a,b); } + SI F max(F a, F b) { return vmaxq_f32(a,b); } + SI F abs_ (F v) { return vabsq_f32(v); } + SI F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; } + SI F rsqrt (F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; } + SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); } + SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); } + + SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); } + + #if defined(__aarch64__) + SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); } + SI F floor_(F v) { return vrndmq_f32(v); } + SI F sqrt_(F v) { return vsqrtq_f32(v); } + SI U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); } + #else + SI F mad(F f, F m, F a) { return vmlaq_f32(a,f,m); } + SI F floor_(F v) { + F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v)); + return roundtrip - if_then_else(roundtrip > v, 1, 0); + } + + SI F sqrt_(F v) { + auto e = vrsqrteq_f32(v); // Estimate and two refinement steps for e = rsqrt(v). + e *= vrsqrtsq_f32(v,e*e); + e *= vrsqrtsq_f32(v,e*e); + return v*e; // sqrt(v) == v*rsqrt(v). 
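        // (Why this works: vrsqrtsq_f32(v, e*e) computes (3 - v*e*e)/2, so each
        //  line above is one Newton-Raphson step e' = e*(3 - v*e*e)/2 toward
        //  1/sqrt(v); each step roughly doubles the number of correct bits before
        //  the final multiply turns rsqrt(v) into sqrt(v).)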
+ } + + SI U32 round(F v, F scale) { + return vcvtq_u32_f32(mad(v,scale,0.5f)); + } + #endif + + + template + SI V gather(const T* p, U32 ix) { + return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; + } + + SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { + uint16x4x3_t rgb; + if (__builtin_expect(tail,0)) { + if ( true ) { rgb = vld3_lane_u16(ptr + 0, rgb, 0); } + if (tail > 1) { rgb = vld3_lane_u16(ptr + 3, rgb, 1); } + if (tail > 2) { rgb = vld3_lane_u16(ptr + 6, rgb, 2); } + } else { + rgb = vld3_u16(ptr); + } + *r = rgb.val[0]; + *g = rgb.val[1]; + *b = rgb.val[2]; + } + SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { + uint16x4x4_t rgba; + if (__builtin_expect(tail,0)) { + if ( true ) { rgba = vld4_lane_u16(ptr + 0, rgba, 0); } + if (tail > 1) { rgba = vld4_lane_u16(ptr + 4, rgba, 1); } + if (tail > 2) { rgba = vld4_lane_u16(ptr + 8, rgba, 2); } + } else { + rgba = vld4_u16(ptr); + } + *r = rgba.val[0]; + *g = rgba.val[1]; + *b = rgba.val[2]; + *a = rgba.val[3]; + } + SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { + if (__builtin_expect(tail,0)) { + if ( true ) { vst4_lane_u16(ptr + 0, (uint16x4x4_t{{r,g,b,a}}), 0); } + if (tail > 1) { vst4_lane_u16(ptr + 4, (uint16x4x4_t{{r,g,b,a}}), 1); } + if (tail > 2) { vst4_lane_u16(ptr + 8, (uint16x4x4_t{{r,g,b,a}}), 2); } + } else { + vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}})); + } + } + SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { + float32x4x4_t rgba; + if (__builtin_expect(tail,0)) { + if ( true ) { rgba = vld4q_lane_f32(ptr + 0, rgba, 0); } + if (tail > 1) { rgba = vld4q_lane_f32(ptr + 4, rgba, 1); } + if (tail > 2) { rgba = vld4q_lane_f32(ptr + 8, rgba, 2); } + } else { + rgba = vld4q_f32(ptr); + } + *r = rgba.val[0]; + *g = rgba.val[1]; + *b = rgba.val[2]; + *a = rgba.val[3]; + } + SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { + if (__builtin_expect(tail,0)) { + if ( true ) { vst4q_lane_f32(ptr + 0, (float32x4x4_t{{r,g,b,a}}), 0); } + if (tail > 1) { vst4q_lane_f32(ptr + 4, (float32x4x4_t{{r,g,b,a}}), 1); } + if (tail > 2) { vst4q_lane_f32(ptr + 8, (float32x4x4_t{{r,g,b,a}}), 2); } + } else { + vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}})); + } + } + +#elif defined(JUMPER_IS_AVX) || defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) + #include + + // These are __m256 and __m256i, but friendlier and strongly-typed. 
+ template using V = T __attribute__((ext_vector_type(8))); + using F = V; + using I32 = V< int32_t>; + using U64 = V; + using U32 = V; + using U16 = V; + using U8 = V; + + SI F mad(F f, F m, F a) { + #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) + return _mm256_fmadd_ps(f,m,a); + #else + return f*m+a; + #endif + } + + SI F min(F a, F b) { return _mm256_min_ps(a,b); } + SI F max(F a, F b) { return _mm256_max_ps(a,b); } + SI F abs_ (F v) { return _mm256_and_ps(v, 0-v); } + SI F floor_(F v) { return _mm256_floor_ps(v); } + SI F rcp (F v) { return _mm256_rcp_ps (v); } + SI F rsqrt (F v) { return _mm256_rsqrt_ps(v); } + SI F sqrt_(F v) { return _mm256_sqrt_ps (v); } + SI U32 round (F v, F scale) { return _mm256_cvtps_epi32(v*scale); } + + SI U16 pack(U32 v) { + return _mm_packus_epi32(_mm256_extractf128_si256(v, 0), + _mm256_extractf128_si256(v, 1)); + } + SI U8 pack(U16 v) { + auto r = _mm_packus_epi16(v,v); + return unaligned_load(&r); + } + + SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); } + + template + SI V gather(const T* p, U32 ix) { + return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]], + p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], }; + } + #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) + SI F gather(const float* p, U32 ix) { return _mm256_i32gather_ps (p, ix, 4); } + SI U32 gather(const uint32_t* p, U32 ix) { return _mm256_i32gather_epi32(p, ix, 4); } + SI U64 gather(const uint64_t* p, U32 ix) { + __m256i parts[] = { + _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,0), 8), + _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,1), 8), + }; + return bit_cast(parts); + } + #endif + + SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { + __m128i _0,_1,_2,_3,_4,_5,_6,_7; + if (__builtin_expect(tail,0)) { + auto load_rgb = [](const uint16_t* src) { + auto v = _mm_cvtsi32_si128(*(const uint32_t*)src); + return _mm_insert_epi16(v, src[2], 2); + }; + _1 = _2 = _3 = _4 = _5 = _6 = _7 = _mm_setzero_si128(); + if ( true ) { _0 = load_rgb(ptr + 0); } + if (tail > 1) { _1 = load_rgb(ptr + 3); } + if (tail > 2) { _2 = load_rgb(ptr + 6); } + if (tail > 3) { _3 = load_rgb(ptr + 9); } + if (tail > 4) { _4 = load_rgb(ptr + 12); } + if (tail > 5) { _5 = load_rgb(ptr + 15); } + if (tail > 6) { _6 = load_rgb(ptr + 18); } + } else { + // Load 0+1, 2+3, 4+5 normally, and 6+7 backed up 4 bytes so we don't run over. 
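            // (8 pixels of 16-bit RGB span 24 uint16_t = 48 bytes; four 16-byte
            //  loads would read 64.  Starting the last load at ptr+16 ends it
            //  exactly at the 48-byte mark, and the 4-byte shift drops the two
            //  lanes belonging to pixel 5 so pixel 6 starts at lane 0 again.)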
+ auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) ; + auto _23 = _mm_loadu_si128((const __m128i*)(ptr + 6)) ; + auto _45 = _mm_loadu_si128((const __m128i*)(ptr + 12)) ; + auto _67 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 16)), 4); + _0 = _01; _1 = _mm_srli_si128(_01, 6); + _2 = _23; _3 = _mm_srli_si128(_23, 6); + _4 = _45; _5 = _mm_srli_si128(_45, 6); + _6 = _67; _7 = _mm_srli_si128(_67, 6); + } + + auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx + _13 = _mm_unpacklo_epi16(_1, _3), + _46 = _mm_unpacklo_epi16(_4, _6), + _57 = _mm_unpacklo_epi16(_5, _7); + + auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 + bx0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 xx xx xx xx + rg4567 = _mm_unpacklo_epi16(_46, _57), + bx4567 = _mm_unpackhi_epi16(_46, _57); + + *r = _mm_unpacklo_epi64(rg0123, rg4567); + *g = _mm_unpackhi_epi64(rg0123, rg4567); + *b = _mm_unpacklo_epi64(bx0123, bx4567); + } + SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { + __m128i _01, _23, _45, _67; + if (__builtin_expect(tail,0)) { + auto src = (const double*)ptr; + _01 = _23 = _45 = _67 = _mm_setzero_si128(); + if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); } + if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); } + if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); } + if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); } + if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); } + if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); } + if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); } + } else { + _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); + _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); + _45 = _mm_loadu_si128(((__m128i*)ptr) + 2); + _67 = _mm_loadu_si128(((__m128i*)ptr) + 3); + } + + auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 + _13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3 + _46 = _mm_unpacklo_epi16(_45, _67), + _57 = _mm_unpackhi_epi16(_45, _67); + + auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 + ba0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 a0 a1 a2 a3 + rg4567 = _mm_unpacklo_epi16(_46, _57), + ba4567 = _mm_unpackhi_epi16(_46, _57); + + *r = _mm_unpacklo_epi64(rg0123, rg4567); + *g = _mm_unpackhi_epi64(rg0123, rg4567); + *b = _mm_unpacklo_epi64(ba0123, ba4567); + *a = _mm_unpackhi_epi64(ba0123, ba4567); + } + SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { + auto rg0123 = _mm_unpacklo_epi16(r, g), // r0 g0 r1 g1 r2 g2 r3 g3 + rg4567 = _mm_unpackhi_epi16(r, g), // r4 g4 r5 g5 r6 g6 r7 g7 + ba0123 = _mm_unpacklo_epi16(b, a), + ba4567 = _mm_unpackhi_epi16(b, a); + + auto _01 = _mm_unpacklo_epi32(rg0123, ba0123), + _23 = _mm_unpackhi_epi32(rg0123, ba0123), + _45 = _mm_unpacklo_epi32(rg4567, ba4567), + _67 = _mm_unpackhi_epi32(rg4567, ba4567); + + if (__builtin_expect(tail,0)) { + auto dst = (double*)ptr; + if (tail > 0) { _mm_storel_pd(dst+0, _01); } + if (tail > 1) { _mm_storeh_pd(dst+1, _01); } + if (tail > 2) { _mm_storel_pd(dst+2, _23); } + if (tail > 3) { _mm_storeh_pd(dst+3, _23); } + if (tail > 4) { _mm_storel_pd(dst+4, _45); } + if (tail > 5) { _mm_storeh_pd(dst+5, _45); } + if (tail > 6) { _mm_storel_pd(dst+6, _67); } + } else { + _mm_storeu_si128((__m128i*)ptr + 0, _01); + _mm_storeu_si128((__m128i*)ptr + 1, _23); + _mm_storeu_si128((__m128i*)ptr + 2, _45); + _mm_storeu_si128((__m128i*)ptr + 3, _67); + } + } + + SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { + F _04, _15, _26, _37; + _04 = _15 = _26 = _37 = 0; + switch 
(tail) { + case 0: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1); + case 7: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1); + case 6: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1); + case 5: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1); + case 4: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+12), 0); + case 3: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+ 8), 0); + case 2: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+ 4), 0); + case 1: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+ 0), 0); + } + + F rg0145 = _mm256_unpacklo_ps(_04,_15), // r0 r1 g0 g1 | r4 r5 g4 g5 + ba0145 = _mm256_unpackhi_ps(_04,_15), + rg2367 = _mm256_unpacklo_ps(_26,_37), + ba2367 = _mm256_unpackhi_ps(_26,_37); + + *r = _mm256_unpacklo_pd(rg0145, rg2367); + *g = _mm256_unpackhi_pd(rg0145, rg2367); + *b = _mm256_unpacklo_pd(ba0145, ba2367); + *a = _mm256_unpackhi_pd(ba0145, ba2367); + } + SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { + F rg0145 = _mm256_unpacklo_ps(r, g), // r0 g0 r1 g1 | r4 g4 r5 g5 + rg2367 = _mm256_unpackhi_ps(r, g), // r2 ... | r6 ... + ba0145 = _mm256_unpacklo_ps(b, a), // b0 a0 b1 a1 | b4 a4 b5 a5 + ba2367 = _mm256_unpackhi_ps(b, a); // b2 ... | b6 ... + + F _04 = _mm256_unpacklo_pd(rg0145, ba0145), // r0 g0 b0 a0 | r4 g4 b4 a4 + _15 = _mm256_unpackhi_pd(rg0145, ba0145), // r1 ... | r5 ... + _26 = _mm256_unpacklo_pd(rg2367, ba2367), // r2 ... | r6 ... + _37 = _mm256_unpackhi_pd(rg2367, ba2367); // r3 ... | r7 ... + + if (__builtin_expect(tail, 0)) { + if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); } + if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); } + if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); } + if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); } + if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); } + if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); } + if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); } + } else { + F _01 = _mm256_permute2f128_ps(_04, _15, 32), // 32 == 0010 0000 == lo, lo + _23 = _mm256_permute2f128_ps(_26, _37, 32), + _45 = _mm256_permute2f128_ps(_04, _15, 49), // 49 == 0011 0001 == hi, hi + _67 = _mm256_permute2f128_ps(_26, _37, 49); + _mm256_storeu_ps(ptr+ 0, _01); + _mm256_storeu_ps(ptr+ 8, _23); + _mm256_storeu_ps(ptr+16, _45); + _mm256_storeu_ps(ptr+24, _67); + } + } + +#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) + #include + + template using V = T __attribute__((ext_vector_type(4))); + using F = V; + using I32 = V< int32_t>; + using U64 = V; + using U32 = V; + using U16 = V; + using U8 = V; + + SI F mad(F f, F m, F a) { return f*m+a; } + SI F min(F a, F b) { return _mm_min_ps(a,b); } + SI F max(F a, F b) { return _mm_max_ps(a,b); } + SI F abs_(F v) { return _mm_and_ps(v, 0-v); } + SI F rcp (F v) { return _mm_rcp_ps (v); } + SI F rsqrt (F v) { return _mm_rsqrt_ps(v); } + SI F sqrt_(F v) { return _mm_sqrt_ps (v); } + SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); } + + SI U16 pack(U32 v) { + #if defined(JUMPER_IS_SSE41) + auto p = _mm_packus_epi32(v,v); + #else + // Sign extend so that _mm_packs_epi32() does the pack we want. + auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16); + p = _mm_packs_epi32(p,p); + #endif + return unaligned_load(&p); // We have two copies. Return (the lower) one. 
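        // (The SSE2 branch leans on _mm_packs_epi32(), which saturates *signed*
        //  32-bit lanes; sign-extending the low 16 bits first makes every lane fit
        //  in int16, so the signed pack preserves each lane's low 16 bits exactly
        //  instead of clamping values above 0x7fff.)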
+ } + SI U8 pack(U16 v) { + auto r = widen_cast<__m128i>(v); + r = _mm_packus_epi16(r,r); + return unaligned_load(&r); + } + + SI F if_then_else(I32 c, F t, F e) { + return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e)); + } + + SI F floor_(F v) { + #if defined(JUMPER_IS_SSE41) + return _mm_floor_ps(v); + #else + F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v)); + return roundtrip - if_then_else(roundtrip > v, 1, 0); + #endif + } + + template + SI V gather(const T* p, U32 ix) { + return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; + } + + SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { + __m128i _0, _1, _2, _3; + if (__builtin_expect(tail,0)) { + _1 = _2 = _3 = _mm_setzero_si128(); + auto load_rgb = [](const uint16_t* src) { + auto v = _mm_cvtsi32_si128(*(const uint32_t*)src); + return _mm_insert_epi16(v, src[2], 2); + }; + if ( true ) { _0 = load_rgb(ptr + 0); } + if (tail > 1) { _1 = load_rgb(ptr + 3); } + if (tail > 2) { _2 = load_rgb(ptr + 6); } + } else { + // Load slightly weirdly to make sure we don't load past the end of 4x48 bits. + auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) , + _23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4); + + // Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored). + _0 = _01; + _1 = _mm_srli_si128(_01, 6); + _2 = _23; + _3 = _mm_srli_si128(_23, 6); + } + + // De-interlace to R,G,B. + auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx + _13 = _mm_unpacklo_epi16(_1, _3); // r1 r3 g1 g3 b1 b3 xx xx + + auto R = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 + G = _mm_srli_si128(R, 8), + B = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 xx xx xx xx + + *r = unaligned_load(&R); + *g = unaligned_load(&G); + *b = unaligned_load(&B); + } + + SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { + __m128i _01, _23; + if (__builtin_expect(tail,0)) { + _01 = _23 = _mm_setzero_si128(); + auto src = (const double*)ptr; + if ( true ) { _01 = _mm_loadl_pd(_01, src + 0); } // r0 g0 b0 a0 00 00 00 00 + if (tail > 1) { _01 = _mm_loadh_pd(_01, src + 1); } // r0 g0 b0 a0 r1 g1 b1 a1 + if (tail > 2) { _23 = _mm_loadl_pd(_23, src + 2); } // r2 g2 b2 a2 00 00 00 00 + } else { + _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 b0 a0 r1 g1 b1 a1 + _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); // r2 g2 b2 a2 r3 g3 b3 a3 + } + + auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 + _13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3 + + auto rg = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 + ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3 + + *r = unaligned_load((uint16_t*)&rg + 0); + *g = unaligned_load((uint16_t*)&rg + 4); + *b = unaligned_load((uint16_t*)&ba + 0); + *a = unaligned_load((uint16_t*)&ba + 4); + } + + SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { + auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)), + ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a)); + + if (__builtin_expect(tail, 0)) { + auto dst = (double*)ptr; + if ( true ) { _mm_storel_pd(dst + 0, _mm_unpacklo_epi32(rg, ba)); } + if (tail > 1) { _mm_storeh_pd(dst + 1, _mm_unpacklo_epi32(rg, ba)); } + if (tail > 2) { _mm_storel_pd(dst + 2, _mm_unpackhi_epi32(rg, ba)); } + } else { + _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba)); + _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba)); + } + } + + SI void load4(const float* ptr, 
size_t tail, F* r, F* g, F* b, F* a) { + F _0, _1, _2, _3; + if (__builtin_expect(tail, 0)) { + _1 = _2 = _3 = _mm_setzero_si128(); + if ( true ) { _0 = _mm_loadu_ps(ptr + 0); } + if (tail > 1) { _1 = _mm_loadu_ps(ptr + 4); } + if (tail > 2) { _2 = _mm_loadu_ps(ptr + 8); } + } else { + _0 = _mm_loadu_ps(ptr + 0); + _1 = _mm_loadu_ps(ptr + 4); + _2 = _mm_loadu_ps(ptr + 8); + _3 = _mm_loadu_ps(ptr +12); + } + _MM_TRANSPOSE4_PS(_0,_1,_2,_3); + *r = _0; + *g = _1; + *b = _2; + *a = _3; + } + + SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { + _MM_TRANSPOSE4_PS(r,g,b,a); + if (__builtin_expect(tail, 0)) { + if ( true ) { _mm_storeu_ps(ptr + 0, r); } + if (tail > 1) { _mm_storeu_ps(ptr + 4, g); } + if (tail > 2) { _mm_storeu_ps(ptr + 8, b); } + } else { + _mm_storeu_ps(ptr + 0, r); + _mm_storeu_ps(ptr + 4, g); + _mm_storeu_ps(ptr + 8, b); + _mm_storeu_ps(ptr +12, a); + } + } +#endif + +// We need to be a careful with casts. +// (F)x means cast x to float in the portable path, but bit_cast x to float in the others. +// These named casts and bit_cast() are always what they seem to be. +#if defined(JUMPER_IS_SCALAR) + SI F cast (U32 v) { return (F)v; } + SI U32 trunc_(F v) { return (U32)v; } + SI U32 expand(U16 v) { return (U32)v; } + SI U32 expand(U8 v) { return (U32)v; } +#else + SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); } + SI U32 trunc_(F v) { return (U32)__builtin_convertvector( v, I32); } + SI U32 expand(U16 v) { return __builtin_convertvector( v, U32); } + SI U32 expand(U8 v) { return __builtin_convertvector( v, U32); } +#endif + +template +SI V if_then_else(I32 c, V t, V e) { + return bit_cast(if_then_else(c, bit_cast(t), bit_cast(e))); +} + +SI U16 bswap(U16 x) { +#if defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) + // Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes + // when generating code for SSE2 and SSE4.1. We'll do it manually... + auto v = widen_cast<__m128i>(x); + v = _mm_slli_epi16(v,8) | _mm_srli_epi16(v,8); + return unaligned_load(&v); +#else + return (x<<8) | (x>>8); +#endif +} + +SI F fract(F v) { return v - floor_(v); } + +// See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html. +SI F approx_log2(F x) { + // e - 127 is a fair approximation of log2(x) in its own right... + F e = cast(bit_cast(x)) * (1.0f / (1<<23)); + + // ... but using the mantissa to refine its error is _much_ better. + F m = bit_cast((bit_cast(x) & 0x007fffff) | 0x3f000000); + return e + - 124.225514990f + - 1.498030302f * m + - 1.725879990f / (0.3520887068f + m); +} +SI F approx_pow2(F x) { + F f = fract(x); + return bit_cast(round(1.0f * (1<<23), + x + 121.274057500f + - 1.490129070f * f + + 27.728023300f / (4.84252568f - f))); +} + +SI F approx_powf(F x, F y) { + return if_then_else(x == 0, 0 + , approx_pow2(approx_log2(x) * y)); +} + +SI F from_half(U16 h) { +#if defined(__aarch64__) && !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds. + return vcvt_f32_f16(h); + +#elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) + return _mm256_cvtph_ps(h); + +#else + // Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias. + U32 sem = expand(h), + s = sem & 0x8000, + em = sem ^ s; + + // Convert to 1-8-23 float with 127 bias, flushing denorm halfs (including zero) to zero. + auto denorm = (I32)em < 0x0400; // I32 comparison is often quicker, and always safe here. 
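    // (0x0400 is the smallest normal half: anything below it has a zero exponent
    //  field.  For normal values, shifting the exponent+mantissa up by 13 lines both
    //  fields up with their float positions, and adding (127-15)<<23 rebiases the
    //  5-bit, 15-biased half exponent to the 8-bit, 127-biased float exponent.)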
+ return if_then_else(denorm, F(0) + , bit_cast( (s<<16) + (em<<13) + ((127-15)<<23) )); +#endif +} + +SI U16 to_half(F f) { +#if defined(__aarch64__) && !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds. + return vcvt_f16_f32(f); + +#elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) + return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION); + +#else + // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias. + U32 sem = bit_cast(f), + s = sem & 0x80000000, + em = sem ^ s; + + // Convert to 1-5-10 half with 15 bias, flushing denorm halfs (including zero) to zero. + auto denorm = (I32)em < 0x38800000; // I32 comparison is often quicker, and always safe here. + return pack(if_then_else(denorm, U32(0) + , (s>>16) + (em>>13) - ((127-15)<<10))); +#endif +} + +// Our fundamental vector depth is our pixel stride. +static const size_t N = sizeof(F) / sizeof(float); + // We're finally going to get to what a Stage function looks like! // tail == 0 ~~> work on a full N pixels // tail != 0 ~~> work on only the first tail pixels diff --git a/src/jumper/SkJumper_vectors.h b/src/jumper/SkJumper_vectors.h deleted file mode 100644 index e7919f75fc..0000000000 --- a/src/jumper/SkJumper_vectors.h +++ /dev/null @@ -1,700 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ - -#ifndef SkJumper_vectors_DEFINED -#define SkJumper_vectors_DEFINED - -#include "SkJumper.h" -#include "SkJumper_misc.h" -#include - -// This file contains vector types that SkJumper_stages.cpp uses to define stages. - -// Every function in this file should be marked static and inline using SI (see SkJumper_misc.h). - -#if !defined(__clang__) - #define JUMPER_IS_SCALAR -#elif defined(__ARM_NEON) - #define JUMPER_IS_NEON -#elif defined(__AVX512F__) - #define JUMPER_IS_AVX512 -#elif defined(__AVX2__) && defined(__F16C__) && defined(__FMA__) - #define JUMPER_IS_HSW -#elif defined(__AVX__) - #define JUMPER_IS_AVX -#elif defined(__SSE4_1__) - #define JUMPER_IS_SSE41 -#elif defined(__SSE2__) - #define JUMPER_IS_SSE2 -#else - #define JUMPER_IS_SCALAR -#endif - -// Older Clangs seem to crash when generating non-optimized NEON code for ARMv7. -#if defined(__clang__) && !defined(__OPTIMIZE__) && defined(__arm__) - // Apple Clang 9 and vanilla Clang 5 are fine, and may even be conservative. - #if defined(__apple_build_version__) && __clang_major__ < 9 - #define JUMPER_IS_SCALAR - #elif __clang_major__ < 5 - #define JUMPER_IS_SCALAR - #endif -#endif - -#if defined(JUMPER_IS_SCALAR) - // This path should lead to portable scalar code. - #include - - using F = float ; - using I32 = int32_t; - using U64 = uint64_t; - using U32 = uint32_t; - using U16 = uint16_t; - using U8 = uint8_t ; - - SI F mad(F f, F m, F a) { return f*m+a; } - SI F min(F a, F b) { return fminf(a,b); } - SI F max(F a, F b) { return fmaxf(a,b); } - SI F abs_ (F v) { return fabsf(v); } - SI F floor_(F v) { return floorf(v); } - SI F rcp (F v) { return 1.0f / v; } - SI F rsqrt (F v) { return 1.0f / sqrtf(v); } - SI F sqrt_(F v) { return sqrtf(v); } - SI U32 round (F v, F scale) { return (uint32_t)(v*scale + 0.5f); } - SI U16 pack(U32 v) { return (U16)v; } - SI U8 pack(U16 v) { return (U8)v; } - - SI F if_then_else(I32 c, F t, F e) { return c ? 
t : e; } - - template - SI T gather(const T* p, U32 ix) { return p[ix]; } - - SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { - *r = ptr[0]; - *g = ptr[1]; - *b = ptr[2]; - } - SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { - *r = ptr[0]; - *g = ptr[1]; - *b = ptr[2]; - *a = ptr[3]; - } - SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { - ptr[0] = r; - ptr[1] = g; - ptr[2] = b; - ptr[3] = a; - } - - SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { - *r = ptr[0]; - *g = ptr[1]; - *b = ptr[2]; - *a = ptr[3]; - } - SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { - ptr[0] = r; - ptr[1] = g; - ptr[2] = b; - ptr[3] = a; - } - -#elif defined(JUMPER_IS_NEON) - #include - - // Since we know we're using Clang, we can use its vector extensions. - template using V = T __attribute__((ext_vector_type(4))); - using F = V; - using I32 = V< int32_t>; - using U64 = V; - using U32 = V; - using U16 = V; - using U8 = V; - - // We polyfill a few routines that Clang doesn't build into ext_vector_types. - SI F min(F a, F b) { return vminq_f32(a,b); } - SI F max(F a, F b) { return vmaxq_f32(a,b); } - SI F abs_ (F v) { return vabsq_f32(v); } - SI F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; } - SI F rsqrt (F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; } - SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); } - SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); } - - SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); } - - #if defined(__aarch64__) - SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); } - SI F floor_(F v) { return vrndmq_f32(v); } - SI F sqrt_(F v) { return vsqrtq_f32(v); } - SI U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); } - #else - SI F mad(F f, F m, F a) { return vmlaq_f32(a,f,m); } - SI F floor_(F v) { - F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v)); - return roundtrip - if_then_else(roundtrip > v, 1, 0); - } - - SI F sqrt_(F v) { - auto e = vrsqrteq_f32(v); // Estimate and two refinement steps for e = rsqrt(v). - e *= vrsqrtsq_f32(v,e*e); - e *= vrsqrtsq_f32(v,e*e); - return v*e; // sqrt(v) == v*rsqrt(v). 
- } - - SI U32 round(F v, F scale) { - return vcvtq_u32_f32(mad(v,scale,0.5f)); - } - #endif - - - template - SI V gather(const T* p, U32 ix) { - return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; - } - - SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { - uint16x4x3_t rgb; - if (__builtin_expect(tail,0)) { - if ( true ) { rgb = vld3_lane_u16(ptr + 0, rgb, 0); } - if (tail > 1) { rgb = vld3_lane_u16(ptr + 3, rgb, 1); } - if (tail > 2) { rgb = vld3_lane_u16(ptr + 6, rgb, 2); } - } else { - rgb = vld3_u16(ptr); - } - *r = rgb.val[0]; - *g = rgb.val[1]; - *b = rgb.val[2]; - } - SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { - uint16x4x4_t rgba; - if (__builtin_expect(tail,0)) { - if ( true ) { rgba = vld4_lane_u16(ptr + 0, rgba, 0); } - if (tail > 1) { rgba = vld4_lane_u16(ptr + 4, rgba, 1); } - if (tail > 2) { rgba = vld4_lane_u16(ptr + 8, rgba, 2); } - } else { - rgba = vld4_u16(ptr); - } - *r = rgba.val[0]; - *g = rgba.val[1]; - *b = rgba.val[2]; - *a = rgba.val[3]; - } - SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { - if (__builtin_expect(tail,0)) { - if ( true ) { vst4_lane_u16(ptr + 0, (uint16x4x4_t{{r,g,b,a}}), 0); } - if (tail > 1) { vst4_lane_u16(ptr + 4, (uint16x4x4_t{{r,g,b,a}}), 1); } - if (tail > 2) { vst4_lane_u16(ptr + 8, (uint16x4x4_t{{r,g,b,a}}), 2); } - } else { - vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}})); - } - } - SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { - float32x4x4_t rgba; - if (__builtin_expect(tail,0)) { - if ( true ) { rgba = vld4q_lane_f32(ptr + 0, rgba, 0); } - if (tail > 1) { rgba = vld4q_lane_f32(ptr + 4, rgba, 1); } - if (tail > 2) { rgba = vld4q_lane_f32(ptr + 8, rgba, 2); } - } else { - rgba = vld4q_f32(ptr); - } - *r = rgba.val[0]; - *g = rgba.val[1]; - *b = rgba.val[2]; - *a = rgba.val[3]; - } - SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { - if (__builtin_expect(tail,0)) { - if ( true ) { vst4q_lane_f32(ptr + 0, (float32x4x4_t{{r,g,b,a}}), 0); } - if (tail > 1) { vst4q_lane_f32(ptr + 4, (float32x4x4_t{{r,g,b,a}}), 1); } - if (tail > 2) { vst4q_lane_f32(ptr + 8, (float32x4x4_t{{r,g,b,a}}), 2); } - } else { - vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}})); - } - } - -#elif defined(JUMPER_IS_AVX) || defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) - #include - - // These are __m256 and __m256i, but friendlier and strongly-typed. 
- template using V = T __attribute__((ext_vector_type(8))); - using F = V; - using I32 = V< int32_t>; - using U64 = V; - using U32 = V; - using U16 = V; - using U8 = V; - - SI F mad(F f, F m, F a) { - #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) - return _mm256_fmadd_ps(f,m,a); - #else - return f*m+a; - #endif - } - - SI F min(F a, F b) { return _mm256_min_ps(a,b); } - SI F max(F a, F b) { return _mm256_max_ps(a,b); } - SI F abs_ (F v) { return _mm256_and_ps(v, 0-v); } - SI F floor_(F v) { return _mm256_floor_ps(v); } - SI F rcp (F v) { return _mm256_rcp_ps (v); } - SI F rsqrt (F v) { return _mm256_rsqrt_ps(v); } - SI F sqrt_(F v) { return _mm256_sqrt_ps (v); } - SI U32 round (F v, F scale) { return _mm256_cvtps_epi32(v*scale); } - - SI U16 pack(U32 v) { - return _mm_packus_epi32(_mm256_extractf128_si256(v, 0), - _mm256_extractf128_si256(v, 1)); - } - SI U8 pack(U16 v) { - auto r = _mm_packus_epi16(v,v); - return unaligned_load(&r); - } - - SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); } - - template - SI V gather(const T* p, U32 ix) { - return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]], - p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], }; - } - #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) - SI F gather(const float* p, U32 ix) { return _mm256_i32gather_ps (p, ix, 4); } - SI U32 gather(const uint32_t* p, U32 ix) { return _mm256_i32gather_epi32(p, ix, 4); } - SI U64 gather(const uint64_t* p, U32 ix) { - __m256i parts[] = { - _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,0), 8), - _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,1), 8), - }; - return bit_cast(parts); - } - #endif - - SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { - __m128i _0,_1,_2,_3,_4,_5,_6,_7; - if (__builtin_expect(tail,0)) { - auto load_rgb = [](const uint16_t* src) { - auto v = _mm_cvtsi32_si128(*(const uint32_t*)src); - return _mm_insert_epi16(v, src[2], 2); - }; - _1 = _2 = _3 = _4 = _5 = _6 = _7 = _mm_setzero_si128(); - if ( true ) { _0 = load_rgb(ptr + 0); } - if (tail > 1) { _1 = load_rgb(ptr + 3); } - if (tail > 2) { _2 = load_rgb(ptr + 6); } - if (tail > 3) { _3 = load_rgb(ptr + 9); } - if (tail > 4) { _4 = load_rgb(ptr + 12); } - if (tail > 5) { _5 = load_rgb(ptr + 15); } - if (tail > 6) { _6 = load_rgb(ptr + 18); } - } else { - // Load 0+1, 2+3, 4+5 normally, and 6+7 backed up 4 bytes so we don't run over. 
- auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) ; - auto _23 = _mm_loadu_si128((const __m128i*)(ptr + 6)) ; - auto _45 = _mm_loadu_si128((const __m128i*)(ptr + 12)) ; - auto _67 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 16)), 4); - _0 = _01; _1 = _mm_srli_si128(_01, 6); - _2 = _23; _3 = _mm_srli_si128(_23, 6); - _4 = _45; _5 = _mm_srli_si128(_45, 6); - _6 = _67; _7 = _mm_srli_si128(_67, 6); - } - - auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx - _13 = _mm_unpacklo_epi16(_1, _3), - _46 = _mm_unpacklo_epi16(_4, _6), - _57 = _mm_unpacklo_epi16(_5, _7); - - auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 - bx0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 xx xx xx xx - rg4567 = _mm_unpacklo_epi16(_46, _57), - bx4567 = _mm_unpackhi_epi16(_46, _57); - - *r = _mm_unpacklo_epi64(rg0123, rg4567); - *g = _mm_unpackhi_epi64(rg0123, rg4567); - *b = _mm_unpacklo_epi64(bx0123, bx4567); - } - SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { - __m128i _01, _23, _45, _67; - if (__builtin_expect(tail,0)) { - auto src = (const double*)ptr; - _01 = _23 = _45 = _67 = _mm_setzero_si128(); - if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); } - if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); } - if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); } - if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); } - if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); } - if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); } - if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); } - } else { - _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); - _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); - _45 = _mm_loadu_si128(((__m128i*)ptr) + 2); - _67 = _mm_loadu_si128(((__m128i*)ptr) + 3); - } - - auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 - _13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3 - _46 = _mm_unpacklo_epi16(_45, _67), - _57 = _mm_unpackhi_epi16(_45, _67); - - auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 - ba0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 a0 a1 a2 a3 - rg4567 = _mm_unpacklo_epi16(_46, _57), - ba4567 = _mm_unpackhi_epi16(_46, _57); - - *r = _mm_unpacklo_epi64(rg0123, rg4567); - *g = _mm_unpackhi_epi64(rg0123, rg4567); - *b = _mm_unpacklo_epi64(ba0123, ba4567); - *a = _mm_unpackhi_epi64(ba0123, ba4567); - } - SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { - auto rg0123 = _mm_unpacklo_epi16(r, g), // r0 g0 r1 g1 r2 g2 r3 g3 - rg4567 = _mm_unpackhi_epi16(r, g), // r4 g4 r5 g5 r6 g6 r7 g7 - ba0123 = _mm_unpacklo_epi16(b, a), - ba4567 = _mm_unpackhi_epi16(b, a); - - auto _01 = _mm_unpacklo_epi32(rg0123, ba0123), - _23 = _mm_unpackhi_epi32(rg0123, ba0123), - _45 = _mm_unpacklo_epi32(rg4567, ba4567), - _67 = _mm_unpackhi_epi32(rg4567, ba4567); - - if (__builtin_expect(tail,0)) { - auto dst = (double*)ptr; - if (tail > 0) { _mm_storel_pd(dst+0, _01); } - if (tail > 1) { _mm_storeh_pd(dst+1, _01); } - if (tail > 2) { _mm_storel_pd(dst+2, _23); } - if (tail > 3) { _mm_storeh_pd(dst+3, _23); } - if (tail > 4) { _mm_storel_pd(dst+4, _45); } - if (tail > 5) { _mm_storeh_pd(dst+5, _45); } - if (tail > 6) { _mm_storel_pd(dst+6, _67); } - } else { - _mm_storeu_si128((__m128i*)ptr + 0, _01); - _mm_storeu_si128((__m128i*)ptr + 1, _23); - _mm_storeu_si128((__m128i*)ptr + 2, _45); - _mm_storeu_si128((__m128i*)ptr + 3, _67); - } - } - - SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { - F _04, _15, _26, _37; - _04 = _15 = _26 = _37 = 0; - switch 
(tail) { - case 0: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1); - case 7: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1); - case 6: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1); - case 5: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1); - case 4: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+12), 0); - case 3: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+ 8), 0); - case 2: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+ 4), 0); - case 1: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+ 0), 0); - } - - F rg0145 = _mm256_unpacklo_ps(_04,_15), // r0 r1 g0 g1 | r4 r5 g4 g5 - ba0145 = _mm256_unpackhi_ps(_04,_15), - rg2367 = _mm256_unpacklo_ps(_26,_37), - ba2367 = _mm256_unpackhi_ps(_26,_37); - - *r = _mm256_unpacklo_pd(rg0145, rg2367); - *g = _mm256_unpackhi_pd(rg0145, rg2367); - *b = _mm256_unpacklo_pd(ba0145, ba2367); - *a = _mm256_unpackhi_pd(ba0145, ba2367); - } - SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { - F rg0145 = _mm256_unpacklo_ps(r, g), // r0 g0 r1 g1 | r4 g4 r5 g5 - rg2367 = _mm256_unpackhi_ps(r, g), // r2 ... | r6 ... - ba0145 = _mm256_unpacklo_ps(b, a), // b0 a0 b1 a1 | b4 a4 b5 a5 - ba2367 = _mm256_unpackhi_ps(b, a); // b2 ... | b6 ... - - F _04 = _mm256_unpacklo_pd(rg0145, ba0145), // r0 g0 b0 a0 | r4 g4 b4 a4 - _15 = _mm256_unpackhi_pd(rg0145, ba0145), // r1 ... | r5 ... - _26 = _mm256_unpacklo_pd(rg2367, ba2367), // r2 ... | r6 ... - _37 = _mm256_unpackhi_pd(rg2367, ba2367); // r3 ... | r7 ... - - if (__builtin_expect(tail, 0)) { - if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); } - if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); } - if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); } - if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); } - if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); } - if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); } - if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); } - } else { - F _01 = _mm256_permute2f128_ps(_04, _15, 32), // 32 == 0010 0000 == lo, lo - _23 = _mm256_permute2f128_ps(_26, _37, 32), - _45 = _mm256_permute2f128_ps(_04, _15, 49), // 49 == 0011 0001 == hi, hi - _67 = _mm256_permute2f128_ps(_26, _37, 49); - _mm256_storeu_ps(ptr+ 0, _01); - _mm256_storeu_ps(ptr+ 8, _23); - _mm256_storeu_ps(ptr+16, _45); - _mm256_storeu_ps(ptr+24, _67); - } - } - -#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) - #include - - template using V = T __attribute__((ext_vector_type(4))); - using F = V; - using I32 = V< int32_t>; - using U64 = V; - using U32 = V; - using U16 = V; - using U8 = V; - - SI F mad(F f, F m, F a) { return f*m+a; } - SI F min(F a, F b) { return _mm_min_ps(a,b); } - SI F max(F a, F b) { return _mm_max_ps(a,b); } - SI F abs_(F v) { return _mm_and_ps(v, 0-v); } - SI F rcp (F v) { return _mm_rcp_ps (v); } - SI F rsqrt (F v) { return _mm_rsqrt_ps(v); } - SI F sqrt_(F v) { return _mm_sqrt_ps (v); } - SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); } - - SI U16 pack(U32 v) { - #if defined(JUMPER_IS_SSE41) - auto p = _mm_packus_epi32(v,v); - #else - // Sign extend so that _mm_packs_epi32() does the pack we want. - auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16); - p = _mm_packs_epi32(p,p); - #endif - return unaligned_load(&p); // We have two copies. Return (the lower) one. 
- } - SI U8 pack(U16 v) { - auto r = widen_cast<__m128i>(v); - r = _mm_packus_epi16(r,r); - return unaligned_load(&r); - } - - SI F if_then_else(I32 c, F t, F e) { - return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e)); - } - - SI F floor_(F v) { - #if defined(JUMPER_IS_SSE41) - return _mm_floor_ps(v); - #else - F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v)); - return roundtrip - if_then_else(roundtrip > v, 1, 0); - #endif - } - - template - SI V gather(const T* p, U32 ix) { - return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; - } - - SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { - __m128i _0, _1, _2, _3; - if (__builtin_expect(tail,0)) { - _1 = _2 = _3 = _mm_setzero_si128(); - auto load_rgb = [](const uint16_t* src) { - auto v = _mm_cvtsi32_si128(*(const uint32_t*)src); - return _mm_insert_epi16(v, src[2], 2); - }; - if ( true ) { _0 = load_rgb(ptr + 0); } - if (tail > 1) { _1 = load_rgb(ptr + 3); } - if (tail > 2) { _2 = load_rgb(ptr + 6); } - } else { - // Load slightly weirdly to make sure we don't load past the end of 4x48 bits. - auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) , - _23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4); - - // Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored). - _0 = _01; - _1 = _mm_srli_si128(_01, 6); - _2 = _23; - _3 = _mm_srli_si128(_23, 6); - } - - // De-interlace to R,G,B. - auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx - _13 = _mm_unpacklo_epi16(_1, _3); // r1 r3 g1 g3 b1 b3 xx xx - - auto R = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 - G = _mm_srli_si128(R, 8), - B = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 xx xx xx xx - - *r = unaligned_load(&R); - *g = unaligned_load(&G); - *b = unaligned_load(&B); - } - - SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { - __m128i _01, _23; - if (__builtin_expect(tail,0)) { - _01 = _23 = _mm_setzero_si128(); - auto src = (const double*)ptr; - if ( true ) { _01 = _mm_loadl_pd(_01, src + 0); } // r0 g0 b0 a0 00 00 00 00 - if (tail > 1) { _01 = _mm_loadh_pd(_01, src + 1); } // r0 g0 b0 a0 r1 g1 b1 a1 - if (tail > 2) { _23 = _mm_loadl_pd(_23, src + 2); } // r2 g2 b2 a2 00 00 00 00 - } else { - _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 b0 a0 r1 g1 b1 a1 - _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); // r2 g2 b2 a2 r3 g3 b3 a3 - } - - auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 - _13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3 - - auto rg = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 - ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3 - - *r = unaligned_load((uint16_t*)&rg + 0); - *g = unaligned_load((uint16_t*)&rg + 4); - *b = unaligned_load((uint16_t*)&ba + 0); - *a = unaligned_load((uint16_t*)&ba + 4); - } - - SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { - auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)), - ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a)); - - if (__builtin_expect(tail, 0)) { - auto dst = (double*)ptr; - if ( true ) { _mm_storel_pd(dst + 0, _mm_unpacklo_epi32(rg, ba)); } - if (tail > 1) { _mm_storeh_pd(dst + 1, _mm_unpacklo_epi32(rg, ba)); } - if (tail > 2) { _mm_storel_pd(dst + 2, _mm_unpackhi_epi32(rg, ba)); } - } else { - _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba)); - _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba)); - } - } - - SI void load4(const float* ptr, 
size_t tail, F* r, F* g, F* b, F* a) { - F _0, _1, _2, _3; - if (__builtin_expect(tail, 0)) { - _1 = _2 = _3 = _mm_setzero_si128(); - if ( true ) { _0 = _mm_loadu_ps(ptr + 0); } - if (tail > 1) { _1 = _mm_loadu_ps(ptr + 4); } - if (tail > 2) { _2 = _mm_loadu_ps(ptr + 8); } - } else { - _0 = _mm_loadu_ps(ptr + 0); - _1 = _mm_loadu_ps(ptr + 4); - _2 = _mm_loadu_ps(ptr + 8); - _3 = _mm_loadu_ps(ptr +12); - } - _MM_TRANSPOSE4_PS(_0,_1,_2,_3); - *r = _0; - *g = _1; - *b = _2; - *a = _3; - } - - SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { - _MM_TRANSPOSE4_PS(r,g,b,a); - if (__builtin_expect(tail, 0)) { - if ( true ) { _mm_storeu_ps(ptr + 0, r); } - if (tail > 1) { _mm_storeu_ps(ptr + 4, g); } - if (tail > 2) { _mm_storeu_ps(ptr + 8, b); } - } else { - _mm_storeu_ps(ptr + 0, r); - _mm_storeu_ps(ptr + 4, g); - _mm_storeu_ps(ptr + 8, b); - _mm_storeu_ps(ptr +12, a); - } - } -#endif - -// We need to be a careful with casts. -// (F)x means cast x to float in the portable path, but bit_cast x to float in the others. -// These named casts and bit_cast() are always what they seem to be. -#if defined(JUMPER_IS_SCALAR) - SI F cast (U32 v) { return (F)v; } - SI U32 trunc_(F v) { return (U32)v; } - SI U32 expand(U16 v) { return (U32)v; } - SI U32 expand(U8 v) { return (U32)v; } -#else - SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); } - SI U32 trunc_(F v) { return (U32)__builtin_convertvector( v, I32); } - SI U32 expand(U16 v) { return __builtin_convertvector( v, U32); } - SI U32 expand(U8 v) { return __builtin_convertvector( v, U32); } -#endif - -template -SI V if_then_else(I32 c, V t, V e) { - return bit_cast(if_then_else(c, bit_cast(t), bit_cast(e))); -} - -SI U16 bswap(U16 x) { -#if defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) - // Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes - // when generating code for SSE2 and SSE4.1. We'll do it manually... - auto v = widen_cast<__m128i>(x); - v = _mm_slli_epi16(v,8) | _mm_srli_epi16(v,8); - return unaligned_load(&v); -#else - return (x<<8) | (x>>8); -#endif -} - -SI F fract(F v) { return v - floor_(v); } - -// See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html. -SI F approx_log2(F x) { - // e - 127 is a fair approximation of log2(x) in its own right... - F e = cast(bit_cast(x)) * (1.0f / (1<<23)); - - // ... but using the mantissa to refine its error is _much_ better. - F m = bit_cast((bit_cast(x) & 0x007fffff) | 0x3f000000); - return e - - 124.225514990f - - 1.498030302f * m - - 1.725879990f / (0.3520887068f + m); -} -SI F approx_pow2(F x) { - F f = fract(x); - return bit_cast(round(1.0f * (1<<23), - x + 121.274057500f - - 1.490129070f * f - + 27.728023300f / (4.84252568f - f))); -} - -SI F approx_powf(F x, F y) { - return if_then_else(x == 0, 0 - , approx_pow2(approx_log2(x) * y)); -} - -SI F from_half(U16 h) { -#if defined(__aarch64__) && !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds. - return vcvt_f32_f16(h); - -#elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) - return _mm256_cvtph_ps(h); - -#else - // Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias. - U32 sem = expand(h), - s = sem & 0x8000, - em = sem ^ s; - - // Convert to 1-8-23 float with 127 bias, flushing denorm halfs (including zero) to zero. - auto denorm = (I32)em < 0x0400; // I32 comparison is often quicker, and always safe here. 
- return if_then_else(denorm, F(0) - , bit_cast( (s<<16) + (em<<13) + ((127-15)<<23) )); -#endif -} - -SI U16 to_half(F f) { -#if defined(__aarch64__) && !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds. - return vcvt_f16_f32(f); - -#elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) - return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION); - -#else - // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias. - U32 sem = bit_cast(f), - s = sem & 0x80000000, - em = sem ^ s; - - // Convert to 1-5-10 half with 15 bias, flushing denorm halfs (including zero) to zero. - auto denorm = (I32)em < 0x38800000; // I32 comparison is often quicker, and always safe here. - return pack(if_then_else(denorm, U32(0) - , (s>>16) + (em>>13) - ((127-15)<<10))); -#endif -} - - - -#endif//SkJumper_vectors_DEFINED -- cgit v1.2.3
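As a rough illustration of how the primitives above fit together (a hypothetical
sketch, not part of this patch: the function name and the premultiply operation are
invented, but load4/store4, from_half/to_half, F and U16 are exactly the definitions
added to SkJumper_stages.cpp), a step that processes up to N f16 pixels could look like:

    SI void premul_f16(uint16_t* px, size_t tail) {
        U16 r16, g16, b16, a16;
        load4(px, tail, &r16, &g16, &b16, &a16);      // read up to N half-float pixels
        F r = from_half(r16), g = from_half(g16),
          b = from_half(b16), a = from_half(a16);
        r = r*a;                                      // plain operators act lane-wise on F
        g = g*a;
        b = b*a;
        store4(px, tail, to_half(r), to_half(g),      // write back, honoring the tail count
                         to_half(b), to_half(a));
    }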