diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/splicer/SkSplicer_stages.cpp | 190 | ||||
-rwxr-xr-x | src/splicer/build_stages.py | 12 |
2 files changed, 151 insertions, 51 deletions
diff --git a/src/splicer/SkSplicer_stages.cpp b/src/splicer/SkSplicer_stages.cpp index 7f9a5b3802..19daee3fbb 100644 --- a/src/splicer/SkSplicer_stages.cpp +++ b/src/splicer/SkSplicer_stages.cpp @@ -8,14 +8,31 @@ #include "SkSplicer_shared.h" #include <string.h> -#if !defined(__clang__) - #error This file is not like the rest of Skia. It must be compiled with clang. -#endif - // It's tricky to relocate code referencing ordinary constants, so we read them from this struct. using K = const SkSplicer_constants; -#if defined(__aarch64__) +#if !defined(SPLICER) && !defined(JUMPER) + // This path should lead to portable code that can be compiled directly into Skia. + // (All other paths are compiled offline by Clang into SkSplicer_generated.h.) + #include <math.h> + + using F = float; + using I32 = int32_t; + using U32 = uint32_t; + using U8 = uint8_t; + + static F fma(F f, F m, F a) { return f*m+a; } + static F min(F a, F b) { return fminf(a,b); } + static F max(F a, F b) { return fmaxf(a,b); } + static F rcp (F v) { return 1.0f / v; } + static F rsqrt(F v) { return 1.0f / sqrtf(v); } + static U32 round(F v, F scale) { return (uint32_t)(v*scale); } + + static F if_then_else(I32 c, F t, F e) { return c ? t : e; } + + static F gather(const float* p, U32 ix) { return p[ix]; } + +#elif defined(__aarch64__) #include <arm_neon.h> // Since we know we're using Clang, we can use its vector extensions. @@ -35,6 +52,7 @@ using K = const SkSplicer_constants; static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); } static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; } + #elif defined(__ARM_NEON__) #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__) #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb. @@ -57,6 +75,7 @@ using K = const SkSplicer_constants; static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); } static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; } + #elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__) #include <immintrin.h> @@ -76,6 +95,7 @@ using K = const SkSplicer_constants; static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); } static F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, ix, 4); } + #elif defined(__SSE2__) #include <immintrin.h> @@ -102,8 +122,16 @@ using K = const SkSplicer_constants; static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; } #endif -static F cast (U32 v) { return __builtin_convertvector((I32)v, F); } -static U32 expand(U8 v) { return __builtin_convertvector( v, U32); } +// We need to be a careful with casts. +// (F)x means cast x to float in the portable path, but bit_cast x to float in the others. +// These named casts and bit_cast() are always what they seem to be. +#if !defined(SPLICER) && !defined(JUMPER) + static F cast (U32 v) { return (F)v; } + static U32 expand(U8 v) { return (U32)v; } +#else + static F cast (U32 v) { return __builtin_convertvector((I32)v, F); } + static U32 expand(U8 v) { return __builtin_convertvector( v, U32); } +#endif template <typename T, typename P> static T unaligned_load(const P* p) { @@ -112,55 +140,115 @@ static T unaligned_load(const P* p) { return v; } -using F4 = float __attribute__((ext_vector_type(4))); +template <typename Dst, typename Src> +static Dst bit_cast(const Src& src) { + static_assert(sizeof(Dst) == sizeof(Src), ""); + return unaligned_load<Dst>(&src); +} + +// Sometimes we want to work with 4 floats directly, regardless of the depth of the F vector. +#if !defined(SPLICER) && !defined(JUMPER) + struct F4 { + float vals[4]; + float operator[](int i) const { return vals[i]; } + }; +#else + using F4 = float __attribute__((ext_vector_type(4))); +#endif // We'll be compiling this file to an object file, then extracting parts of it into // SkSplicer_generated.h. It's easier to do if the function names are not C++ mangled. #define C extern "C" -// Stages all fit a common interface that allows SkSplicer to splice them together. -using Stage = void(size_t x, size_t limit, void* ctx, K* k, F,F,F,F, F,F,F,F); - -// Stage's arguments act as the working set of registers within the final spliced function. -// Here's a little primer on the x86-64/aarch64 ABIs: -// x: rdi/x0 x and limit work to drive the loop, see loop_start in SkSplicer.cpp. -// limit: rsi/x1 -// ctx: rdx/x2 Look for set_ctx in SkSplicer.cpp to see how this works. -// k: rcx/x3 -// vectors: ymm0-ymm7/v0-v7 - +#if defined(SPLICER) + // Splicer Stages all fit a common interface that allows SkSplicer to splice them together. + // (This is just for reference... nothing uses this type when we're in Splicer mode.) + using Stage = void(size_t x, size_t limit, void* ctx, K* k, F,F,F,F, F,F,F,F); + + // Stage's arguments act as the working set of registers within the final spliced function. + // Here's a little primer on the x86-64/aarch64 ABIs: + // x: rdi/x0 x and limit work to drive the loop, see loop_start in SkSplicer.cpp. + // limit: rsi/x1 + // ctx: rdx/x2 Look for set_ctx in SkSplicer.cpp to see how this works. + // k: rcx/x3 + // vectors: ymm0-ymm7/v0-v7 + + // done() is the key to this entire splicing strategy. + // + // It matches the signature of Stage, so all the registers are kept live. + // Every Stage calls done() and so will end in a single jmp (i.e. tail-call) into done(), + // which marks the point where we can splice one Stage onto the next. + // + // The lovely bit is that we don't have to define done(), just declare it. + C void done(size_t, size_t, void*, K*, F,F,F,F, F,F,F,F); + + // This should feel familiar to anyone who's read SkRasterPipeline_opts.h. + // It's just a convenience to make a valid, spliceable Stage, nothing magic. + #define STAGE(name) \ + static void name##_k(size_t& x, size_t limit, void* ctx, K* k, \ + F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \ + C void name(size_t x, size_t limit, void* ctx, K* k, \ + F r, F g, F b, F a, F dr, F dg, F db, F da) { \ + name##_k(x,limit,ctx,k, r,g,b,a, dr,dg,db,da); \ + done (x,limit,ctx,k, r,g,b,a, dr,dg,db,da); \ + } \ + static void name##_k(size_t& x, size_t limit, void* ctx, K* k, \ + F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da) +#else + // Jumper and portable Stages tail call between each other by following + // program, an interlaced sequence of Stage pointers and context pointers. + using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F); + + static void* load_and_inc(void**& program) { + #if defined(__GNUC__) && defined(__x86_64__) + // Passing program as the second Stage argument makes it likely that it's in %rsi, + // so this is usually a single instruction *program++. + void* rax; + asm("lodsq" : "=a"(rax), "+S"(program)); // Write-only %rax, read-write %rsi. + return rax; + // When a Stage uses its ctx pointer, this optimization typically cuts an instruction: + // mov (%rsi), %rcx // ctx = program[0] + // ... + // mov 0x8(%rsi), %rax // next = program[1] + // add $0x10, %rsi // program += 2 + // jmpq *%rax // JUMP! + // becomes + // lods %ds:(%rsi),%rax // ctx = *program++; + // ... + // lods %ds:(%rsi),%rax // next = *program++; + // jmpq *%rax // JUMP! + // + // When a Stage doesn't use its ctx pointer, it's 3 instructions either way, + // but using lodsq (a 2-byte instruction) tends to trim a few bytes. + #else + // On ARM *program++ compiles into a single instruction without any handholding. + return *program++; + #endif + } -// done() is the key to this entire splicing strategy. -// -// It matches the signature of Stage, so all the registers are kept live. -// Every Stage calls done() and so will end in a single jmp (i.e. tail-call) into done(), -// which marks the point where we can splice one Stage onto the next. -// -// The lovely bit is that we don't have to define done(), just declare it. -C void done(size_t, size_t, void*, K*, F,F,F,F, F,F,F,F); - -// This should feel familiar to anyone who's read SkRasterPipeline_opts.h. -// It's just a convenience to make a valid, spliceable Stage, nothing magic. -#define STAGE(name) \ - static void name##_k(size_t& x, size_t limit, void* ctx, K* k, \ - F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \ - C void name(size_t x, size_t limit, void* ctx, K* k, \ - F r, F g, F b, F a, F dr, F dg, F db, F da) { \ - name##_k(x,limit,ctx,k, r,g,b,a, dr,dg,db,da); \ - done (x,limit,ctx,k, r,g,b,a, dr,dg,db,da); \ - } \ - static void name##_k(size_t& x, size_t limit, void* ctx, K* k, \ - F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da) + #define STAGE(name) \ + static void name##_k(size_t& x, void* ctx, K* k, \ + F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \ + C void name(size_t x, void** program, K* k, \ + F r, F g, F b, F a, F dr, F dg, F db, F da) { \ + auto ctx = load_and_inc(program); \ + name##_k(x,ctx,k, r,g,b,a, dr,dg,db,da); \ + auto next = (Stage*)load_and_inc(program); \ + next(x,program,k, r,g,b,a, dr,dg,db,da); \ + } \ + static void name##_k(size_t& x, void* ctx, K* k, \ + F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da) +#endif // We can now define Stages! // Some things to keep in mind while writing Stages: // - do not branch; (i.e. avoid jmp) -// - do not call functions that don't inline; (i.e. avoid call, ret, stack use) +// - do not call functions that don't inline; (i.e. avoid call, ret) // - do not use constant literals other than 0 and 0.0f. (i.e. avoid rip relative addressing) // // Some things that should work fine: -// - 0 and 0.0f; +// - 0, ~0, and 0.0f; // - arithmetic; // - functions of F and U32 that we've defined above; // - temporary values; @@ -350,7 +438,9 @@ STAGE(store_8888) { STAGE(load_f16) { auto ptr = *(const uint64_t**)ctx + x; -#if defined(__aarch64__) +#if !defined(SPLICER) && !defined(JUMPER) + // TODO: +#elif defined(__aarch64__) auto halfs = vld4_f16((const float16_t*)ptr); r = vcvt_f32_f16(halfs.val[0]); g = vcvt_f32_f16(halfs.val[1]); @@ -395,8 +485,8 @@ STAGE(load_f16) { ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3 auto half_to_float = [&](U32 h) { - return (F)(h << 13) // Line up the mantissa, - * (F)U32(k->_0x77800000); // then fix up the exponent. + return bit_cast<F>(h << 13) // Line up the mantissa, + * bit_cast<F>(U32(k->_0x77800000)); // then fix up the exponent. }; r = half_to_float(_mm_unpacklo_epi16(rg, _mm_setzero_si128())); @@ -409,7 +499,9 @@ STAGE(load_f16) { STAGE(store_f16) { auto ptr = *(uint64_t**)ctx + x; -#if defined(__aarch64__) +#if !defined(SPLICER) && !defined(JUMPER) + // TODO: +#elif defined(__aarch64__) float16x4x4_t halfs = {{ vcvt_f16_f32(r), vcvt_f16_f32(g), @@ -440,8 +532,8 @@ STAGE(store_f16) { _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567)); #elif defined(__SSE2__) auto float_to_half = [&](F f) { - return (U32)(f * (F)U32(k->_0x07800000)) // Fix up the exponent, - >> 13; // then line up the mantissa. + return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000))) // Fix up the exponent, + >> 13; // then line up the mantissa. }; U32 R = float_to_half(r), G = float_to_half(g), @@ -455,7 +547,7 @@ STAGE(store_f16) { } static F clamp(const F& v, float limit) { - F l = (F)((U32)F(limit) + U32(0xffffffff)); // limit - 1 ulp + F l = bit_cast<F>(bit_cast<U32>(F(limit)) + U32(0xffffffff)); // limit - 1 ulp return max(0, min(v, l)); } STAGE(clamp_x) { r = clamp(r, *(const float*)ctx); } diff --git a/src/splicer/build_stages.py b/src/splicer/build_stages.py index a6f55c3b4e..cb7f6faa18 100755 --- a/src/splicer/build_stages.py +++ b/src/splicer/build_stages.py @@ -17,14 +17,22 @@ objdump = 'gobjdump' #ndk = '/home/mtklein/ndk/' #objdump = '/home/mtklein/binutils-2.27/binutils/objdump' -cflags = '-std=c++11 -Os -fomit-frame-pointer'.split() +cflags = '-std=c++11 -Os -fomit-frame-pointer -DSPLICER'.split() + +# This won't stay here long-term. +# It's a stand-in for building SkSplicer_stages.cpp into Skia. +portable = '-USPLICER'.split() +subprocess.check_call(['clang++'] + cflags + portable + + ['-c', 'src/splicer/SkSplicer_stages.cpp'] + + ['-o', 'portable.o']) sse2 = '-mno-red-zone -msse2 -mno-sse3 -mno-ssse3 -mno-sse4.1'.split() subprocess.check_call(['clang++'] + cflags + sse2 + ['-c', 'src/splicer/SkSplicer_stages.cpp'] + ['-o', 'sse2.o']) -sse41 = '-mno-red-zone -msse4.1'.split() +# We're not using sse41.o right now, so use it to test -DJUMPER. +sse41 = '-mno-red-zone -msse4.1 -USPLICER -DJUMPER'.split() subprocess.check_call(['clang++'] + cflags + sse41 + ['-c', 'src/splicer/SkSplicer_stages.cpp'] + ['-o', 'sse41.o']) |