/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkJumper.h"
#include <string.h>

// It's tricky to relocate code referencing ordinary constants, so we read them from this struct.
using K = const SkJumper_constants;

#if !defined(JUMPER)
    // This path should lead to portable code that can be compiled directly into Skia.
    // (All other paths are compiled offline by Clang into SkJumper_generated.h.)
    #include <math.h>

    using F   = float;
    using I32 =  int32_t;
    using U32 = uint32_t;
    using U8  = uint8_t;

    static F   mad(F f, F m, F a)  { return f*m+a; }
    static F   min(F a, F b)       { return fminf(a,b); }
    static F   max(F a, F b)       { return fmaxf(a,b); }
    static F   rcp  (F v)          { return 1.0f / v; }
    static F   rsqrt(F v)          { return 1.0f / sqrtf(v); }
    static U32 round(F v, F scale) { return (uint32_t)lrintf(v*scale); }

    static F if_then_else(I32 c, F t, F e) { return c ? t : e; }

    static F gather(const float* p, U32 ix) { return p[ix]; }

    #define WRAP(name) sk_##name

#elif defined(__aarch64__)
    #include <arm_neon.h>

    // Since we know we're using Clang, we can use its vector extensions.
    using F   = float    __attribute__((ext_vector_type(4)));
    using I32 =  int32_t __attribute__((ext_vector_type(4)));
    using U32 = uint32_t __attribute__((ext_vector_type(4)));
    using U8  = uint8_t  __attribute__((ext_vector_type(4)));

    // We polyfill a few routines that Clang doesn't build into ext_vector_types.
    static F   mad(F f, F m, F a)  { return vfmaq_f32(a,f,m); }
    static F   min(F a, F b)       { return vminq_f32(a,b); }
    static F   max(F a, F b)       { return vmaxq_f32(a,b); }
    static F   rcp  (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e  ) * e; }
    static F   rsqrt(F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
    static U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }

    static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }

    static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }

    #define WRAP(name) sk_##name##_aarch64

#elif defined(__arm__)
    #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
        #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfpv4, without -mthumb.
    #endif
    #include <arm_neon.h>

    // We can pass {s0-s15} as arguments under AAPCS-VFP.  We'll slice that as 8 d-registers.
    using F   = float    __attribute__((ext_vector_type(2)));
    using I32 =  int32_t __attribute__((ext_vector_type(2)));
    using U32 = uint32_t __attribute__((ext_vector_type(2)));
    using U8  = uint8_t  __attribute__((ext_vector_type(2)));

    static F   mad(F f, F m, F a)  { return vfma_f32(a,f,m); }
    static F   min(F a, F b)       { return vmin_f32(a,b); }
    static F   max(F a, F b)       { return vmax_f32(a,b); }
    static F   rcp  (F v) { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e  ) * e; }
    static F   rsqrt(F v) { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
    static U32 round(F v, F scale) { return vcvt_u32_f32(mad(v,scale,0.5f)); }

    static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }

    static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }

    #define WRAP(name) sk_##name##_vfp4

#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
    #include <immintrin.h>

    // These are __m256 and __m256i, but friendlier and strongly-typed.
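    // (With Clang's ext_vector_type these still lower to single 256-bit registers, so the
    //  _mm256_* intrinsics below accept them directly and +, *, <<, etc. act lane-wise.)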
    using F   = float    __attribute__((ext_vector_type(8)));
    using I32 =  int32_t __attribute__((ext_vector_type(8)));
    using U32 = uint32_t __attribute__((ext_vector_type(8)));
    using U8  = uint8_t  __attribute__((ext_vector_type(8)));

    static F   mad(F f, F m, F a)  { return _mm256_fmadd_ps(f,m,a); }
    static F   min(F a, F b)       { return _mm256_min_ps(a,b); }
    static F   max(F a, F b)       { return _mm256_max_ps(a,b); }
    static F   rcp  (F v)          { return _mm256_rcp_ps  (v); }
    static F   rsqrt(F v)          { return _mm256_rsqrt_ps(v); }
    static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }

    static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }

    static F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, ix, 4); }

    #define WRAP(name) sk_##name##_hsw

#elif defined(__AVX__)
    #include <immintrin.h>

    using F   = float    __attribute__((ext_vector_type(8)));
    using I32 =  int32_t __attribute__((ext_vector_type(8)));
    using U32 = uint32_t __attribute__((ext_vector_type(8)));
    using U8  = uint8_t  __attribute__((ext_vector_type(8)));

    static F   mad(F f, F m, F a)  { return f*m+a; }
    static F   min(F a, F b)       { return _mm256_min_ps(a,b); }
    static F   max(F a, F b)       { return _mm256_max_ps(a,b); }
    static F   rcp  (F v)          { return _mm256_rcp_ps  (v); }
    static F   rsqrt(F v)          { return _mm256_rsqrt_ps(v); }
    static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }

    static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }

    static F gather(const float* p, U32 ix) {
        return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
                 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
    }

    #define WRAP(name) sk_##name##_avx

#elif defined(__SSE2__)
    #include <immintrin.h>

    using F   = float    __attribute__((ext_vector_type(4)));
    using I32 =  int32_t __attribute__((ext_vector_type(4)));
    using U32 = uint32_t __attribute__((ext_vector_type(4)));
    using U8  = uint8_t  __attribute__((ext_vector_type(4)));

    static F   mad(F f, F m, F a)  { return f*m+a; }
    static F   min(F a, F b)       { return _mm_min_ps(a,b); }
    static F   max(F a, F b)       { return _mm_max_ps(a,b); }
    static F   rcp  (F v)          { return _mm_rcp_ps  (v); }
    static F   rsqrt(F v)          { return _mm_rsqrt_ps(v); }
    static U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }

    static F if_then_else(I32 c, F t, F e) {
        #if defined(__SSE4_1__)
            return _mm_blendv_ps(e,t,c);
        #else
            return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e));
        #endif
    }

    static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }

    #if defined(__SSE4_1__)
        #define WRAP(name) sk_##name##_sse41
    #else
        #define WRAP(name) sk_##name##_sse2
    #endif
#endif

static F lerp(F from, F to, F t) {
    return mad(to-from, t, from);
}

// We need to be careful with casts.
// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
// These named casts and bit_cast() are always what they seem to be.
#if defined(JUMPER)
    static F   cast  (U32 v) { return __builtin_convertvector((I32)v, F); }
    static U32 expand(U8  v) { return __builtin_convertvector(     v, U32); }
#else
    static F   cast  (U32 v) { return (F)v; }
    static U32 expand(U8  v) { return (U32)v; }
#endif

template <typename T, typename P>
static T unaligned_load(const P* p) {
    T v;
    memcpy(&v, p, sizeof(v));
    return v;
}

template <typename Dst, typename Src>
static Dst bit_cast(const Src& src) {
    static_assert(sizeof(Dst) == sizeof(Src), "");
    return unaligned_load<Dst>(&src);
}

// Sometimes we want to work with 4 floats directly, regardless of the depth of the F vector.
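// (F4 backs contexts that always store exactly four floats -- constant_color and
//  linear_gradient_2stops below -- no matter how wide F is.)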
#if defined(JUMPER)
    using F4 = float __attribute__((ext_vector_type(4)));
#else
    struct F4 {
        float vals[4];
        float operator[](int i) const { return vals[i]; }
    };
#endif

// Stages tail call between each other by following program,
// an interlaced sequence of Stage pointers and context pointers.
using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);

static void* load_and_inc(void**& program) {
#if defined(__GNUC__) && defined(__x86_64__)
    // Passing program as the second Stage argument makes it likely that it's in %rsi,
    // so this is usually a single instruction *program++.
    void* rax;
    asm("lodsq" : "=a"(rax), "+S"(program));  // Write-only %rax, read-write %rsi.
    return rax;
    // When a Stage uses its ctx pointer, this optimization typically cuts an instruction:
    //    mov    (%rsi), %rcx    // ctx  = program[0]
    //    ...
    //    mov 0x8(%rsi), %rax    // next = program[1]
    //    add $0x10, %rsi        // program += 2
    //    jmpq *%rax             // JUMP!
    // becomes
    //    lods %ds:(%rsi),%rax   // ctx  = *program++;
    //    ...
    //    lods %ds:(%rsi),%rax   // next = *program++;
    //    jmpq *%rax             // JUMP!
    //
    // When a Stage doesn't use its ctx pointer, it's 3 instructions either way,
    // but using lodsq (a 2-byte instruction) tends to trim a few bytes.
#else
    // On ARM *program++ compiles into a single instruction without any handholding.
    return *program++;
#endif
}

#define STAGE(name)                                                           \
    static void name##_k(size_t& x, void* ctx, K* k,                          \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
    extern "C" void WRAP(name)(size_t x, void** program, K* k,                \
                               F r, F g, F b, F a, F dr, F dg, F db, F da) {  \
        auto ctx = load_and_inc(program);                                     \
        name##_k(x,ctx,k, r,g,b,a, dr,dg,db,da);                              \
        auto next = (Stage*)load_and_inc(program);                            \
        next(x,program,k, r,g,b,a, dr,dg,db,da);                              \
    }                                                                         \
    static void name##_k(size_t& x, void* ctx, K* k,                          \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)

// Some glue stages that don't fit the normal pattern of stages.

#if defined(JUMPER) && defined(WIN)
__attribute__((ms_abi))
#endif
extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
    F v{};
    size_t stride = sizeof(F) / sizeof(float);
    auto start = (Stage*)load_and_inc(program);
    while (x + stride <= limit) {
        start(x,program,k, v,v,v,v, v,v,v,v);
        x += stride;
    }
    return x;
}

// Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller).
extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {}

// We can now define Stages!

// Some things to keep in mind while writing Stages:
//    - do not branch;                                  (i.e. avoid jmp)
//    - do not call functions that don't inline;        (i.e. avoid call, ret)
//    - do not use constant literals other than 0, ~0 and 0.0f.  (i.e. avoid rip relative addressing)
//
// Some things that should work fine:
//    - 0, ~0, and 0.0f;
//    - arithmetic;
//    - functions of F and U32 that we've defined above;
//    - temporary values;
//    - lambdas;
//    - memcpy() with a compile-time constant size argument.

STAGE(seed_shader) {
    auto y = *(const int*)ctx;

    // It's important for speed to explicitly cast(x) and cast(y),
    // which has the effect of splatting them to vectors before converting to floats.
    // On Intel this breaks a data dependency on previous loop iterations' registers.
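    // (k->iota is assumed to hold {0,1,2,...}, one entry per lane, so each lane of r lands
    //  on its own pixel center x+0.5, x+1.5, ... while g gets the same y+0.5 in every lane.)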
    r = cast(x) + k->_0_5 + unaligned_load<F>(k->iota);
    g = cast(y) + k->_0_5;
    b = k->_1;
    a = 0;
    dr = dg = db = da = 0;
}

STAGE(constant_color) {
    auto rgba = unaligned_load<F4>(ctx);
    r = rgba[0];
    g = rgba[1];
    b = rgba[2];
    a = rgba[3];
}

STAGE(clear) {
    r = g = b = a = 0;
}

STAGE(plus_) {
    r = r + dr;
    g = g + dg;
    b = b + db;
    a = a + da;
}

STAGE(srcover) {
    auto A = k->_1 - a;
    r = mad(dr, A, r);
    g = mad(dg, A, g);
    b = mad(db, A, b);
    a = mad(da, A, a);
}

STAGE(dstover) {
    auto DA = k->_1 - da;
    r = mad(r, DA, dr);
    g = mad(g, DA, dg);
    b = mad(b, DA, db);
    a = mad(a, DA, da);
}

STAGE(clamp_0) {
    r = max(r, 0);
    g = max(g, 0);
    b = max(b, 0);
    a = max(a, 0);
}

STAGE(clamp_1) {
    r = min(r, k->_1);
    g = min(g, k->_1);
    b = min(b, k->_1);
    a = min(a, k->_1);
}

STAGE(clamp_a) {
    a = min(a, k->_1);
    r = min(r, a);
    g = min(g, a);
    b = min(b, a);
}

STAGE(set_rgb) {
    auto rgb = (const float*)ctx;
    r = rgb[0];
    g = rgb[1];
    b = rgb[2];
}

STAGE(swap_rb) {
    auto tmp = r;
    r = b;
    b = tmp;
}

STAGE(swap) {
    auto swap = [](F& v, F& dv) {
        auto tmp = v;
        v = dv;
        dv = tmp;
    };
    swap(r, dr);
    swap(g, dg);
    swap(b, db);
    swap(a, da);
}

STAGE(move_src_dst) {
    dr = r;
    dg = g;
    db = b;
    da = a;
}

STAGE(move_dst_src) {
    r = dr;
    g = dg;
    b = db;
    a = da;
}

STAGE(premul) {
    r = r * a;
    g = g * a;
    b = b * a;
}

STAGE(unpremul) {
    auto scale = if_then_else(a == 0, 0, k->_1 / a);
    r = r * scale;
    g = g * scale;
    b = b * scale;
}

STAGE(from_srgb) {
    auto fn = [&](F s) {
        auto lo = s * k->_1_1292;
        auto hi = mad(s*s, mad(s, k->_03000, k->_06975), k->_00025);
        return if_then_else(s < k->_0055, lo, hi);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}

STAGE(to_srgb) {
    auto fn = [&](F l) {
        F sqrt = rcp  (rsqrt(l)),
          ftrt = rsqrt(rsqrt(l));
        auto lo = l * k->_1246;
        auto hi = min(k->_1, mad(k->_0411192, ftrt,
                             mad(k->_0689206, sqrt, k->n_00988)));
        return if_then_else(l < k->_00043, lo, hi);
    };
    r = fn(r);
    g = fn(g);
    b = fn(b);
}

STAGE(scale_u8) {
    auto ptr = *(const uint8_t**)ctx + x;

    auto scales = unaligned_load<U8>(ptr);
    auto c = cast(expand(scales)) * k->_1_255;

    r = r * c;
    g = g * c;
    b = b * c;
    a = a * c;
}

STAGE(lerp_u8) {
    auto ptr = *(const uint8_t**)ctx + x;

    auto scales = unaligned_load<U8>(ptr);
    auto c = cast(expand(scales)) * k->_1_255;

    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

STAGE(load_tables) {
    struct Ctx { const uint32_t* src; const float *r, *g, *b; };
    auto c = (const Ctx*)ctx;

    auto px = unaligned_load<U32>(c->src + x);
    r = gather(c->r, (px      ) & k->_0x000000ff);
    g = gather(c->g, (px >>  8) & k->_0x000000ff);
    b = gather(c->b, (px >> 16) & k->_0x000000ff);
    a = cast(        (px >> 24)) * k->_1_255;
}

STAGE(load_8888) {
    auto ptr = *(const uint32_t**)ctx + x;

    auto px = unaligned_load<U32>(ptr);
    r = cast((px      ) & k->_0x000000ff) * k->_1_255;
    g = cast((px >>  8) & k->_0x000000ff) * k->_1_255;
    b = cast((px >> 16) & k->_0x000000ff) * k->_1_255;
    a = cast((px >> 24)                 ) * k->_1_255;
}

STAGE(store_8888) {
    auto ptr = *(uint32_t**)ctx + x;

    U32 px = round(r, k->_255)
           | round(g, k->_255) <<  8
           | round(b, k->_255) << 16
           | round(a, k->_255) << 24;
    memcpy(ptr, &px, sizeof(px));
}

STAGE(load_f16) {
    auto ptr = *(const uint64_t**)ctx + x;

#if !defined(JUMPER)
    auto half_to_float = [&](int16_t h) {
        if (h < 0x0400) { h = 0; }                // Flush denorm and negative to zero.
        return bit_cast<F>(h << 13)               // Line up the mantissa,
             * bit_cast<F>(U32(k->_0x77800000));  // then fix up the exponent.
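        // (Why the multiply works: h<<13 slides the 10-bit half mantissa up under the f32
        //  mantissa field, and 0x77800000 is 2^112 as a float, which rebiases the exponent
        //  from the half's bias of 15 to the float's bias of 127.)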
    };
    auto rgba = (const int16_t*)ptr;
    r = half_to_float(rgba[0]);
    g = half_to_float(rgba[1]);
    b = half_to_float(rgba[2]);
    a = half_to_float(rgba[3]);
#elif defined(__aarch64__)
    auto halfs = vld4_f16((const float16_t*)ptr);
    r = vcvt_f32_f16(halfs.val[0]);
    g = vcvt_f32_f16(halfs.val[1]);
    b = vcvt_f32_f16(halfs.val[2]);
    a = vcvt_f32_f16(halfs.val[3]);
#elif defined(__arm__)
    auto rb_ga = vld2_f16((const float16_t*)ptr);
    auto rb = vcvt_f32_f16(rb_ga.val[0]),
         ga = vcvt_f32_f16(rb_ga.val[1]);
    r = {rb[0], rb[2]};
    g = {ga[0], ga[2]};
    b = {rb[1], rb[3]};
    a = {ga[1], ga[3]};
#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
    auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
         _23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
         _45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
         _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);

    auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
         _13 = _mm_unpackhi_epi16(_01, _23),  // r1 r3 g1 g3 b1 b3 a1 a3
         _46 = _mm_unpacklo_epi16(_45, _67),
         _57 = _mm_unpackhi_epi16(_45, _67);

    auto rg0123 = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
         ba0123 = _mm_unpackhi_epi16(_02, _13),  // b0 b1 b2 b3 a0 a1 a2 a3
         rg4567 = _mm_unpacklo_epi16(_46, _57),
         ba4567 = _mm_unpackhi_epi16(_46, _57);

    r = _mm256_cvtph_ps(_mm_unpacklo_epi64(rg0123, rg4567));
    g = _mm256_cvtph_ps(_mm_unpackhi_epi64(rg0123, rg4567));
    b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567));
    a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567));
#elif defined(__AVX__)
    auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
         _23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
         _45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
         _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);

    auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
         _13 = _mm_unpackhi_epi16(_01, _23),  // r1 r3 g1 g3 b1 b3 a1 a3
         _46 = _mm_unpacklo_epi16(_45, _67),
         _57 = _mm_unpackhi_epi16(_45, _67);

    auto rg0123 = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
         ba0123 = _mm_unpackhi_epi16(_02, _13),  // b0 b1 b2 b3 a0 a1 a2 a3
         rg4567 = _mm_unpacklo_epi16(_46, _57),
         ba4567 = _mm_unpackhi_epi16(_46, _57);

    // half_to_float() slows down ~10x for denorm inputs, so we flush them to zero.
    // With a signed comparison this conveniently also flushes negative half floats to zero.
    auto ftz = [k](__m128i v) {
        return _mm_andnot_si128(_mm_cmplt_epi16(v, _mm_set1_epi32(k->_0x04000400)), v);
    };
    rg0123 = ftz(rg0123);
    ba0123 = ftz(ba0123);
    rg4567 = ftz(rg4567);
    ba4567 = ftz(ba4567);

    U32 R = _mm256_setr_m128i(_mm_unpacklo_epi16(rg0123, _mm_setzero_si128()),
                              _mm_unpacklo_epi16(rg4567, _mm_setzero_si128())),
        G = _mm256_setr_m128i(_mm_unpackhi_epi16(rg0123, _mm_setzero_si128()),
                              _mm_unpackhi_epi16(rg4567, _mm_setzero_si128())),
        B = _mm256_setr_m128i(_mm_unpacklo_epi16(ba0123, _mm_setzero_si128()),
                              _mm_unpacklo_epi16(ba4567, _mm_setzero_si128())),
        A = _mm256_setr_m128i(_mm_unpackhi_epi16(ba0123, _mm_setzero_si128()),
                              _mm_unpackhi_epi16(ba4567, _mm_setzero_si128()));

    auto half_to_float = [&](U32 h) {
        return bit_cast<F>(h << 13)               // Line up the mantissa,
             * bit_cast<F>(U32(k->_0x77800000));  // then fix up the exponent.
    };
    r = half_to_float(R);
    g = half_to_float(G);
    b = half_to_float(B);
    a = half_to_float(A);
#elif defined(__SSE2__)
    auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
         _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);

    auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
         _13 = _mm_unpackhi_epi16(_01, _23);  // r1 r3 g1 g3 b1 b3 a1 a3

    auto rg = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
         ba = _mm_unpackhi_epi16(_02, _13);  // b0 b1 b2 b3 a0 a1 a2 a3

    // Same deal as AVX, flush denorms and negatives to zero.
    auto ftz = [k](__m128i v) {
        return _mm_andnot_si128(_mm_cmplt_epi16(v, _mm_set1_epi32(k->_0x04000400)), v);
    };
    rg = ftz(rg);
    ba = ftz(ba);

    auto half_to_float = [&](U32 h) {
        return bit_cast<F>(h << 13)               // Line up the mantissa,
             * bit_cast<F>(U32(k->_0x77800000));  // then fix up the exponent.
    };

    r = half_to_float(_mm_unpacklo_epi16(rg, _mm_setzero_si128()));
    g = half_to_float(_mm_unpackhi_epi16(rg, _mm_setzero_si128()));
    b = half_to_float(_mm_unpacklo_epi16(ba, _mm_setzero_si128()));
    a = half_to_float(_mm_unpackhi_epi16(ba, _mm_setzero_si128()));
#endif
}

STAGE(store_f16) {
    auto ptr = *(uint64_t**)ctx + x;

#if !defined(JUMPER)
    auto float_to_half = [&](F f) {
        return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000)))  // Fix up the exponent,
               >> 13;                                               // then line up the mantissa.
    };
    auto rgba = (int16_t*)ptr;
    rgba[0] = float_to_half(r);
    rgba[1] = float_to_half(g);
    rgba[2] = float_to_half(b);
    rgba[3] = float_to_half(a);
#elif defined(__aarch64__)
    float16x4x4_t halfs = {{
        vcvt_f16_f32(r),
        vcvt_f16_f32(g),
        vcvt_f16_f32(b),
        vcvt_f16_f32(a),
    }};
    vst4_f16((float16_t*)ptr, halfs);
#elif defined(__arm__)
    float16x4x2_t rb_ga = {{
        vcvt_f16_f32(float32x4_t{r[0], b[0], r[1], b[1]}),
        vcvt_f16_f32(float32x4_t{g[0], a[0], g[1], a[1]}),
    }};
    vst2_f16((float16_t*)ptr, rb_ga);
#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
    auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION),
         G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION),
         B = _mm256_cvtps_ph(b, _MM_FROUND_CUR_DIRECTION),
         A = _mm256_cvtps_ph(a, _MM_FROUND_CUR_DIRECTION);

    auto rg0123 = _mm_unpacklo_epi16(R, G),  // r0 g0 r1 g1 r2 g2 r3 g3
         rg4567 = _mm_unpackhi_epi16(R, G),  // r4 g4 r5 g5 r6 g6 r7 g7
         ba0123 = _mm_unpacklo_epi16(B, A),
         ba4567 = _mm_unpackhi_epi16(B, A);

    _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg0123, ba0123));
    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
    _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
    _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
#elif defined(__AVX__)
    auto float_to_half = [&](F f) {
        return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000)))  // Fix up the exponent,
               >> 13;                                               // then line up the mantissa.
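        // (The inverse of load_f16's trick: 0x07800000 is 2^-112 as a float, which rebiases
        //  the exponent back down toward the half's bias of 15 before >>13 repacks the bits.)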
    };
    U32 R = float_to_half(r),
        G = float_to_half(g),
        B = float_to_half(b),
        A = float_to_half(a);

    auto r0123 = _mm256_extractf128_si256(R, 0),
         r4567 = _mm256_extractf128_si256(R, 1),
         g0123 = _mm256_extractf128_si256(G, 0),
         g4567 = _mm256_extractf128_si256(G, 1),
         b0123 = _mm256_extractf128_si256(B, 0),
         b4567 = _mm256_extractf128_si256(B, 1),
         a0123 = _mm256_extractf128_si256(A, 0),
         a4567 = _mm256_extractf128_si256(A, 1);

    auto rg0123 = r0123 | _mm_slli_si128(g0123,2),
         rg4567 = r4567 | _mm_slli_si128(g4567,2),
         ba0123 = b0123 | _mm_slli_si128(a0123,2),
         ba4567 = b4567 | _mm_slli_si128(a4567,2);

    _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg0123, ba0123));
    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
    _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
    _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
#elif defined(__SSE2__)
    auto float_to_half = [&](F f) {
        return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000)))  // Fix up the exponent,
               >> 13;                                               // then line up the mantissa.
    };
    U32 R = float_to_half(r),
        G = float_to_half(g),
        B = float_to_half(b),
        A = float_to_half(a);

    U32 rg = R | _mm_slli_si128(G,2),
        ba = B | _mm_slli_si128(A,2);
    _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
#endif
}

static F clamp(const F& v, float limit) {
    F l = bit_cast<F>(bit_cast<U32>(F(limit)) + U32(0xffffffff));  // limit - 1 ulp
    return max(0, min(v, l));
}

STAGE(clamp_x) { r = clamp(r, *(const float*)ctx); }
STAGE(clamp_y) { g = clamp(g, *(const float*)ctx); }

STAGE(matrix_2x3) {
    auto m = (const float*)ctx;

    auto R = mad(r,m[0], mad(g,m[2], m[4])),
         G = mad(r,m[1], mad(g,m[3], m[5]));
    r = R;
    g = G;
}

STAGE(matrix_3x4) {
    auto m = (const float*)ctx;

    auto R = mad(r,m[0], mad(g,m[3], mad(b,m[6], m[ 9]))),
         G = mad(r,m[1], mad(g,m[4], mad(b,m[7], m[10]))),
         B = mad(r,m[2], mad(g,m[5], mad(b,m[8], m[11])));
    r = R;
    g = G;
    b = B;
}

STAGE(linear_gradient_2stops) {
    struct Ctx { F4 c0, dc; };
    auto c = unaligned_load<Ctx>(ctx);

    auto t = r;
    r = mad(t, c.dc[0], c.c0[0]);
    g = mad(t, c.dc[1], c.c0[1]);
    b = mad(t, c.dc[2], c.c0[2]);
    a = mad(t, c.dc[3], c.c0[3]);
}
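
// A rough sketch of how a caller might drive the portable (non-JUMPER) build of this
// pipeline.  It is illustrative only and never compiled (the helper name below is made up):
// the stage entry points are the WRAP()ed names defined above, but the SkJumper_constants
// fields they read (_0_5, _1, _255, _1_255, iota, ...) are assumed to be filled out
// elsewhere in Skia.
#if 0
static void example_fill_red(uint32_t* dst, size_t n, K* k) {
    int   y       = 0;
    float rgba[4] = {1,0,0,1};   // opaque red

    // program interlaces a Stage pointer and a context pointer per stage,
    // terminated by just_return (which never reads a context).
    void* program[] = {
        (void*)sk_seed_shader,    &y,
        (void*)sk_constant_color, rgba,
        (void*)sk_store_8888,     &dst,     // store_8888 reads a uint32_t** from its ctx
        (void*)sk_just_return,
    };

    // start_pipeline runs whole vectors of pixels and returns how far it got;
    // any tail of fewer than sizeof(F)/sizeof(float) pixels is left to the caller.
    size_t done = sk_start_pipeline(0, program, k, n);
    (void)done;
}
#endif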