Diffstat (limited to 'src/splicer/SkSplicer_stages.cpp')
-rw-r--r-- | src/splicer/SkSplicer_stages.cpp | 43
1 file changed, 38 insertions, 5 deletions
diff --git a/src/splicer/SkSplicer_stages.cpp b/src/splicer/SkSplicer_stages.cpp
index c45f204e22..2d83996f7d 100644
--- a/src/splicer/SkSplicer_stages.cpp
+++ b/src/splicer/SkSplicer_stages.cpp
@@ -33,6 +33,7 @@
     AI static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
     AI static U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
 
+    AI static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
 #elif defined(__ARM_NEON__)
     #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
         #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb.
@@ -53,6 +54,7 @@
     AI static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }
     AI static U32 round(F v, F scale) { return vcvt_u32_f32(fma(v,scale,0.5f)); }
 
+    AI static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
 #else
     #if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
         #error On x86, compile with -mavx2 -mfma -mf16c.
@@ -72,11 +74,20 @@
     AI static F rsqrt(F v) { return _mm256_rsqrt_ps (v); }
     AI static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
     AI static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
+
+    AI static F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, ix, 4); }
 #endif
 
 AI static F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
 AI static U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
 
+template <typename T, typename P>
+AI static T unaligned_load(const P* p) {
+    T v;
+    memcpy(&v, p, sizeof(v));
+    return v;
+}
+
 // We'll be compiling this file to an object file, then extracting parts of it into
 // SkSplicer_generated.h.  It's easier to do if the function names are not C++ mangled.
 // On ARMv7, use aapcs-vfp calling convention to pass as much data in registers as possible.
@@ -241,8 +252,7 @@ STAGE(to_srgb) {
 STAGE(scale_u8) {
     auto ptr = *(const uint8_t**)ctx + x;
 
-    U8 scales;
-    memcpy(&scales, ptr, sizeof(scales));
+    auto scales = unaligned_load<U8>(ptr);
     auto c = cast(expand(scales)) * k->_1_255;
 
     r = r * c;
@@ -251,12 +261,24 @@ STAGE(scale_u8) {
     a = a * c;
 }
 
+STAGE(load_tables) {
+    struct Ctx {
+        const uint32_t* src;
+        const float *r, *g, *b;
+    };
+    auto c = (const Ctx*)ctx;
+
+    auto px = unaligned_load<U32>(c->src + x);
+    r = gather(c->r, (px      ) & k->_0x000000ff);
+    g = gather(c->g, (px >>  8) & k->_0x000000ff);
+    b = gather(c->b, (px >> 16) & k->_0x000000ff);
+    a = cast(        (px >> 24)) * k->_1_255;
+}
+
 STAGE(load_8888) {
     auto ptr = *(const uint32_t**)ctx + x;
 
-    U32 px;
-    memcpy(&px, ptr, sizeof(px));
-
+    auto px = unaligned_load<U32>(ptr);
     r = cast((px      ) & k->_0x000000ff) * k->_1_255;
     g = cast((px >>  8) & k->_0x000000ff) * k->_1_255;
     b = cast((px >> 16) & k->_0x000000ff) * k->_1_255;
@@ -347,3 +369,14 @@ STAGE(store_f16) {
     _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
 #endif
 }
+
+STAGE(matrix_3x4) {
+    auto m = (const float*)ctx;
+
+    auto R = fma(r,m[0], fma(g,m[3], fma(b,m[6], m[ 9]))),
+         G = fma(r,m[1], fma(g,m[4], fma(b,m[7], m[10]))),
+         B = fma(r,m[2], fma(g,m[5], fma(b,m[8], m[11])));
+    r = R;
+    g = G;
+    b = B;
+}
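
The new gather() lowers to a single _mm256_i32gather_ps on AVX2, while the two NEON paths fall back to per-lane indexed loads via list-initialization. Lane for lane, all three compute the same thing as this scalar sketch (the function name and explicit lane count here are illustrative, not from the patch):

    #include <cstdint>

    // out[i] = p[ix[i]] -- what _mm256_i32gather_ps(p, ix, 4) computes per
    // lane, since the scale of 4 bytes equals sizeof(float).
    static void gather_scalar(const float* p, const uint32_t* ix,
                              float* out, int lanes) {
        for (int i = 0; i < lanes; i++) {
            out[i] = p[ix[i]];
        }
    }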
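unaligned_load() centralizes the memcpy idiom that scale_u8 and load_8888 each spelled out by hand: casting the pointer and dereferencing would be undefined behavior when the memory is misaligned or aliases another type, whereas a fixed-size memcpy is always well-defined, and optimizing compilers lower it to a single unaligned vector load. A minimal sketch of the same idiom (the types here are illustrative):

    #include <cstdint>
    #include <cstring>

    static uint32_t load_u32(const uint8_t* p) {
        uint32_t v;
        std::memcpy(&v, p, sizeof(v));  // well-defined for any alignment
        return v;                       // typically emitted as one plain load
    }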
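load_tables differs from load_8888 in that each 8-bit channel indexes its own 256-entry float table instead of being rescaled directly, which is the shape a color-managed decode takes; alpha has no table and is only rescaled. One lane written out scalar (a hypothetical helper, not part of the patch):

    #include <cstdint>

    // Scalar sketch of one lane of load_tables: each byte of the packed
    // 8888 pixel looks up its own table; the top byte needs no mask.
    static void load_tables_scalar(uint32_t px,
                                   const float t_r[256], const float t_g[256],
                                   const float t_b[256],
                                   float* r, float* g, float* b, float* a) {
        *r = t_r[(px      ) & 0xff];
        *g = t_g[(px >>  8) & 0xff];
        *b = t_b[(px >> 16) & 0xff];
        *a =     (px >> 24) * (1/255.0f);
    }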
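matrix_3x4 reads ctx as 12 floats in column-major order: an affine transform of the color vector, where the first three columns mix r, g, and b and m[9..11] is the added translation column. Equivalently, per lane (hypothetical scalar helper):

    //   [R]   [m0 m3 m6 | m9 ]   [r]
    //   [G] = [m1 m4 m7 | m10] * [g]
    //   [B]   [m2 m5 m8 | m11]   [b]
    //                            [1]
    static void matrix_3x4_scalar(const float m[12],
                                  float* r, float* g, float* b) {
        float R = m[0]* *r + m[3]* *g + m[6]* *b + m[ 9];
        float G = m[1]* *r + m[4]* *g + m[7]* *b + m[10];
        float B = m[2]* *r + m[5]* *g + m[8]* *b + m[11];
        *r = R; *g = G; *b = B;  // all three read before any is written
    }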