diff options
author | Mike Klein <mtklein@chromium.org> | 2017-01-13 13:18:44 -0500 |
---|---|---|
committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2017-01-13 20:30:25 +0000 |
commit | 69f98c738c48baf087b09e30f8d2c67471fc3103 (patch) | |
tree | f5b64875c982951767a023744512e318f615e920 /src/splicer | |
parent | 2873c767fdb6b5f785a88cf593bfee27d044fbb5 (diff) |
SkSplicer: implement load_tables and matrix_3x4
These are enough to splice interesting SkColorSpaceXform pipelines.
SkSplicer_stages.cpp is similar to but still intentionally distinct from
SkRasterPipeline_opts. I hope to unify them next week.
unaligned_load() is nothing tricky... just a little refactor.
Change-Id: I05d0fc38dac985aa351d88776ecc14d2457f2124
Reviewed-on: https://skia-review.googlesource.com/7022
Reviewed-by: Matt Sarett <msarett@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/splicer')
-rw-r--r-- | src/splicer/SkSplicer.cpp | 2 | ||||
-rw-r--r-- | src/splicer/SkSplicer_generated.h | 204 | ||||
-rw-r--r-- | src/splicer/SkSplicer_stages.cpp | 43 |
3 files changed, 244 insertions, 5 deletions
diff --git a/src/splicer/SkSplicer.cpp b/src/splicer/SkSplicer.cpp index fcff0afbde..0295c4f2ae 100644 --- a/src/splicer/SkSplicer.cpp +++ b/src/splicer/SkSplicer.cpp @@ -293,10 +293,12 @@ namespace { case SkRasterPipeline::from_srgb: splice(&buf, kSplice_from_srgb ); break; case SkRasterPipeline::to_srgb: splice(&buf, kSplice_to_srgb ); break; case SkRasterPipeline::scale_u8: splice(&buf, kSplice_scale_u8 ); break; + case SkRasterPipeline::load_tables: splice(&buf, kSplice_load_tables ); break; case SkRasterPipeline::load_8888: splice(&buf, kSplice_load_8888 ); break; case SkRasterPipeline::store_8888: splice(&buf, kSplice_store_8888 ); break; case SkRasterPipeline::load_f16: splice(&buf, kSplice_load_f16 ); break; case SkRasterPipeline::store_f16: splice(&buf, kSplice_store_f16 ); break; + case SkRasterPipeline::matrix_3x4: splice(&buf, kSplice_matrix_3x4 ); break; // No joy (probably just not yet implemented). default: diff --git a/src/splicer/SkSplicer_generated.h b/src/splicer/SkSplicer_generated.h index df4bd03331..228f8ff603 100644 --- a/src/splicer/SkSplicer_generated.h +++ b/src/splicer/SkSplicer_generated.h @@ -222,6 +222,58 @@ static const unsigned int kSplice_scale_u8[] = { 0x6e22de02, // fmul v2.4s, v16.4s, v2.4s 0x6e23de03, // fmul v3.4s, v16.4s, v3.4s }; +static const unsigned int kSplice_load_tables[] = { + 0xa9402848, // ldp x8, x10, [x2] + 0xd37ef409, // lsl x9, x0, #2 + 0x4d40c860, // ld1r {v0.4s}, [x3] + 0x3ce96903, // ldr q3, [x8,x9] + 0xa9412448, // ldp x8, x9, [x2,#16] + 0x4e231c01, // and v1.16b, v0.16b, v3.16b + 0x1e26002e, // fmov w14, s1 + 0x6f380462, // ushr v2.4s, v3.4s, #8 + 0x6f300470, // ushr v16.4s, v3.4s, #16 + 0x8b2e494e, // add x14, x10, w14, uxtw #2 + 0x0e0c3c2b, // mov w11, v1.s[1] + 0x0e143c2c, // mov w12, v1.s[2] + 0x0e1c3c2d, // mov w13, v1.s[3] + 0x4e221c01, // and v1.16b, v0.16b, v2.16b + 0x4e301c02, // and v2.16b, v0.16b, v16.16b + 0x0d4081c0, // ld1 {v0.s}[0], [x14] + 0x1e26002e, // fmov w14, s1 + 0x8b2e490e, // add x14, x8, w14, uxtw #2 + 0x8b2b494b, // add x11, x10, w11, uxtw #2 + 0xbc6c5950, // ldr s16, [x10,w12,uxtw #2] + 0xbc6d5951, // ldr s17, [x10,w13,uxtw #2] + 0x0e0c3c2a, // mov w10, v1.s[1] + 0x0e143c2c, // mov w12, v1.s[2] + 0x0e1c3c2d, // mov w13, v1.s[3] + 0x0d4081c1, // ld1 {v1.s}[0], [x14] + 0x0d409160, // ld1 {v0.s}[1], [x11] + 0xbc6c5912, // ldr s18, [x8,w12,uxtw #2] + 0x0e143c4c, // mov w12, v2.s[2] + 0x1e26004e, // fmov w14, s2 + 0xbc6c5933, // ldr s19, [x9,w12,uxtw #2] + 0x8b2e492c, // add x12, x9, w14, uxtw #2 + 0x8b2a490a, // add x10, x8, w10, uxtw #2 + 0x0e0c3c4f, // mov w15, v2.s[1] + 0x0e1c3c4b, // mov w11, v2.s[3] + 0x0d408182, // ld1 {v2.s}[0], [x12] + 0x0d409141, // ld1 {v1.s}[1], [x10] + 0x6e140600, // mov v0.s[2], v16.s[0] + 0xbc6d5910, // ldr s16, [x8,w13,uxtw #2] + 0x8b2f492a, // add x10, x9, w15, uxtw #2 + 0x0d409142, // ld1 {v2.s}[1], [x10] + 0x6e140641, // mov v1.s[2], v18.s[0] + 0x6e1c0620, // mov v0.s[3], v17.s[0] + 0xbc6b5931, // ldr s17, [x9,w11,uxtw #2] + 0x6e1c0601, // mov v1.s[3], v16.s[0] + 0xbd400c70, // ldr s16, [x3,#12] + 0x6f280463, // ushr v3.4s, v3.4s, #24 + 0x6e140662, // mov v2.s[2], v19.s[0] + 0x4e21d863, // scvtf v3.4s, v3.4s + 0x6e1c0622, // mov v2.s[3], v17.s[0] + 0x4f909063, // fmul v3.4s, v3.4s, v16.s[0] +}; static const unsigned int kSplice_load_8888[] = { 0xf9400048, // ldr x8, [x2] 0xd37ef409, // lsl x9, x0, #2 @@ -281,6 +333,33 @@ static const unsigned int kSplice_store_f16[] = { 0x0e216873, // fcvtn v19.4h, v3.4s 0x0c000510, // st4 {v16.4h-v19.4h}, [x8] }; +static const unsigned int kSplice_matrix_3x4[] = { + 0xaa0203e8, // mov x8, x2 + 0x91009049, // add x9, x2, #0x24 + 0x4ddfc913, // ld1r {v19.4s}, [x8], #4 + 0x4d40c930, // ld1r {v16.4s}, [x9] + 0x9100a049, // add x9, x2, #0x28 + 0x4d40c931, // ld1r {v17.4s}, [x9] + 0x2d435454, // ldp s20, s21, [x2,#24] + 0x9100b049, // add x9, x2, #0x2c + 0xbd402056, // ldr s22, [x2,#32] + 0x4d40c932, // ld1r {v18.4s}, [x9] + 0x4f941050, // fmla v16.4s, v2.4s, v20.s[0] + 0x4f951051, // fmla v17.4s, v2.4s, v21.s[0] + 0x2d415454, // ldp s20, s21, [x2,#8] + 0x4f961052, // fmla v18.4s, v2.4s, v22.s[0] + 0x2d425842, // ldp s2, s22, [x2,#16] + 0x4f951030, // fmla v16.4s, v1.4s, v21.s[0] + 0xbd400115, // ldr s21, [x8] + 0x4f821031, // fmla v17.4s, v1.4s, v2.s[0] + 0x4f961032, // fmla v18.4s, v1.4s, v22.s[0] + 0x4e20ce70, // fmla v16.4s, v19.4s, v0.4s + 0x4f951011, // fmla v17.4s, v0.4s, v21.s[0] + 0x4f941012, // fmla v18.4s, v0.4s, v20.s[0] + 0x4eb01e00, // mov v0.16b, v16.16b + 0x4eb11e21, // mov v1.16b, v17.16b + 0x4eb21e42, // mov v2.16b, v18.16b +}; #elif defined(__ARM_NEON__) @@ -505,6 +584,44 @@ static const unsigned int kSplice_scale_u8[] = { 0xe28dd008, // add sp, sp, #8 0xecbd8b02, // vpop {d8} }; +static const unsigned int kSplice_load_tables[] = { + 0xe92d41f0, // push {r4, r5, r6, r7, r8, lr} + 0xe592c000, // ldr ip, [r2] + 0xf4e30c9f, // vld1.32 {d16[]}, [r3 :32] + 0xe08c5100, // add r5, ip, r0, lsl #2 + 0xe592e004, // ldr lr, [r2, #4] + 0xedd51b00, // vldr d17, [r5] + 0xf24021b1, // vand d18, d16, d17 + 0xe592800c, // ldr r8, [r2, #12] + 0xf3f83031, // vshr.u32 d19, d17, #8 + 0xe5924008, // ldr r4, [r2, #8] + 0xed931a03, // vldr s2, [r3, #12] + 0xee325b90, // vmov.32 r5, d18[1] + 0xee126b90, // vmov.32 r6, d18[0] + 0xf3f02031, // vshr.u32 d18, d17, #16 + 0xf24021b2, // vand d18, d16, d18 + 0xf24001b3, // vand d16, d16, d19 + 0xee127b90, // vmov.32 r7, d18[0] + 0xe08e5105, // add r5, lr, r5, lsl #2 + 0xe08e6106, // add r6, lr, r6, lsl #2 + 0xedd50a00, // vldr s1, [r5] + 0xee325b90, // vmov.32 r5, d18[1] + 0xed960a00, // vldr s0, [r6] + 0xee306b90, // vmov.32 r6, d16[1] + 0xe0887107, // add r7, r8, r7, lsl #2 + 0xe088c105, // add ip, r8, r5, lsl #2 + 0xee105b90, // vmov.32 r5, d16[0] + 0xf3e80031, // vshr.u32 d16, d17, #24 + 0xe0846106, // add r6, r4, r6, lsl #2 + 0xeddc2a00, // vldr s5, [ip] + 0xf3fb0620, // vcvt.f32.s32 d16, d16 + 0xed972a00, // vldr s4, [r7] + 0xf2a039c1, // vmul.f32 d3, d16, d1[0] + 0xedd61a00, // vldr s3, [r6] + 0xe0846105, // add r6, r4, r5, lsl #2 + 0xed961a00, // vldr s2, [r6] + 0xe8bd41f0, // pop {r4, r5, r6, r7, r8, lr} +}; static const unsigned int kSplice_load_8888[] = { 0xe592c000, // ldr ip, [r2] 0xf4e30c9f, // vld1.32 {d16[]}, [r3 :32] @@ -581,6 +698,43 @@ static const unsigned int kSplice_store_f16[] = { 0xe08cc180, // add ip, ip, r0, lsl #3 0xf44c084f, // vst2.16 {d16-d17}, [ip] }; +static const unsigned int kSplice_matrix_3x4[] = { + 0xe282c020, // add ip, r2, #32 + 0xf4ec3c9f, // vld1.32 {d19[]}, [ip :32] + 0xe282c02c, // add ip, r2, #44 + 0xf4ec0c9f, // vld1.32 {d16[]}, [ip :32] + 0xe282c01c, // add ip, r2, #28 + 0xf2420c33, // vfma.f32 d16, d2, d19 + 0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32] + 0xe282c018, // add ip, r2, #24 + 0xf4ec2c9f, // vld1.32 {d18[]}, [ip :32] + 0xe282c024, // add ip, r2, #36 + 0xf4ec1c9f, // vld1.32 {d17[]}, [ip :32] + 0xe282c028, // add ip, r2, #40 + 0xf2421c32, // vfma.f32 d17, d2, d18 + 0xf4ec2c9f, // vld1.32 {d18[]}, [ip :32] + 0xe282c010, // add ip, r2, #16 + 0xf2422c34, // vfma.f32 d18, d2, d20 + 0xf4ec3c9f, // vld1.32 {d19[]}, [ip :32] + 0xe282c00c, // add ip, r2, #12 + 0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32] + 0xe282c014, // add ip, r2, #20 + 0xf2411c34, // vfma.f32 d17, d1, d20 + 0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32] + 0xf2410c34, // vfma.f32 d16, d1, d20 + 0xe282c004, // add ip, r2, #4 + 0xf2412c33, // vfma.f32 d18, d1, d19 + 0xf4e23c9f, // vld1.32 {d19[]}, [r2 :32] + 0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32] + 0xe282c008, // add ip, r2, #8 + 0xf2401c33, // vfma.f32 d17, d0, d19 + 0xf4ec3c9f, // vld1.32 {d19[]}, [ip :32] + 0xf2400c33, // vfma.f32 d16, d0, d19 + 0xf2402c34, // vfma.f32 d18, d0, d20 + 0xf22101b1, // vorr d0, d17, d17 + 0xf22021b0, // vorr d2, d16, d16 + 0xf22211b2, // vorr d1, d18, d18 +}; #else @@ -747,6 +901,30 @@ static const unsigned char kSplice_scale_u8[] = { 0xc5,0xbc,0x59,0xd2, // vmulps %ymm2,%ymm8,%ymm2 0xc5,0xbc,0x59,0xdb, // vmulps %ymm3,%ymm8,%ymm3 }; +static const unsigned char kSplice_load_tables[] = { + 0x48,0x8b,0x02, // mov (%rdx),%rax + 0x4c,0x8b,0x42,0x08, // mov 0x8(%rdx),%r8 + 0xc5,0xfc,0x10,0x1c,0xb8, // vmovups (%rax,%rdi,4),%ymm3 + 0xc4,0xe2,0x7d,0x18,0x11, // vbroadcastss (%rcx),%ymm2 + 0xc5,0xec,0x54,0xcb, // vandps %ymm3,%ymm2,%ymm1 + 0xc5,0xfc,0x57,0xc0, // vxorps %ymm0,%ymm0,%ymm0 + 0xc5,0x7c,0xc2,0xc0,0x00, // vcmpeqps %ymm0,%ymm0,%ymm8 + 0xc4,0x41,0x7c,0x28,0xc8, // vmovaps %ymm8,%ymm9 + 0xc4,0xc2,0x35,0x92,0x04,0x88, // vgatherdps %ymm9,(%r8,%ymm1,4),%ymm0 + 0x48,0x8b,0x42,0x10, // mov 0x10(%rdx),%rax + 0xc5,0xf5,0x72,0xd3,0x08, // vpsrld $0x8,%ymm3,%ymm1 + 0xc5,0x6c,0x54,0xc9, // vandps %ymm1,%ymm2,%ymm9 + 0xc4,0x41,0x7c,0x28,0xd0, // vmovaps %ymm8,%ymm10 + 0xc4,0xa2,0x2d,0x92,0x0c,0x88, // vgatherdps %ymm10,(%rax,%ymm9,4),%ymm1 + 0x48,0x8b,0x42,0x18, // mov 0x18(%rdx),%rax + 0xc5,0xb5,0x72,0xd3,0x10, // vpsrld $0x10,%ymm3,%ymm9 + 0xc4,0x41,0x6c,0x54,0xc9, // vandps %ymm9,%ymm2,%ymm9 + 0xc4,0xa2,0x3d,0x92,0x14,0x88, // vgatherdps %ymm8,(%rax,%ymm9,4),%ymm2 + 0xc5,0xe5,0x72,0xd3,0x18, // vpsrld $0x18,%ymm3,%ymm3 + 0xc5,0xfc,0x5b,0xdb, // vcvtdq2ps %ymm3,%ymm3 + 0xc4,0x62,0x7d,0x18,0x41,0x0c, // vbroadcastss 0xc(%rcx),%ymm8 + 0xc4,0xc1,0x64,0x59,0xd8, // vmulps %ymm8,%ymm3,%ymm3 +}; static const unsigned char kSplice_load_8888[] = { 0x48,0x8b,0x02, // mov (%rdx),%rax 0xc5,0xfc,0x10,0x1c,0xb8, // vmovups (%rax,%rdi,4),%ymm3 @@ -828,6 +1006,32 @@ static const unsigned char kSplice_store_f16[] = { 0xc4,0x41,0x39,0x6a,0xc2, // vpunpckhdq %xmm10,%xmm8,%xmm8 0xc5,0x7a,0x7f,0x44,0xf8,0x30, // vmovdqu %xmm8,0x30(%rax,%rdi,8) }; +static const unsigned char kSplice_matrix_3x4[] = { + 0xc4,0x62,0x7d,0x18,0x0a, // vbroadcastss (%rdx),%ymm9 + 0xc4,0x62,0x7d,0x18,0x52,0x0c, // vbroadcastss 0xc(%rdx),%ymm10 + 0xc4,0x62,0x7d,0x18,0x5a,0x18, // vbroadcastss 0x18(%rdx),%ymm11 + 0xc4,0x62,0x7d,0x18,0x42,0x24, // vbroadcastss 0x24(%rdx),%ymm8 + 0xc4,0x42,0x6d,0xb8,0xc3, // vfmadd231ps %ymm11,%ymm2,%ymm8 + 0xc4,0x42,0x75,0xb8,0xc2, // vfmadd231ps %ymm10,%ymm1,%ymm8 + 0xc4,0x42,0x7d,0xb8,0xc1, // vfmadd231ps %ymm9,%ymm0,%ymm8 + 0xc4,0x62,0x7d,0x18,0x52,0x04, // vbroadcastss 0x4(%rdx),%ymm10 + 0xc4,0x62,0x7d,0x18,0x5a,0x10, // vbroadcastss 0x10(%rdx),%ymm11 + 0xc4,0x62,0x7d,0x18,0x62,0x1c, // vbroadcastss 0x1c(%rdx),%ymm12 + 0xc4,0x62,0x7d,0x18,0x4a,0x28, // vbroadcastss 0x28(%rdx),%ymm9 + 0xc4,0x42,0x6d,0xb8,0xcc, // vfmadd231ps %ymm12,%ymm2,%ymm9 + 0xc4,0x42,0x75,0xb8,0xcb, // vfmadd231ps %ymm11,%ymm1,%ymm9 + 0xc4,0x42,0x7d,0xb8,0xca, // vfmadd231ps %ymm10,%ymm0,%ymm9 + 0xc4,0x62,0x7d,0x18,0x5a,0x08, // vbroadcastss 0x8(%rdx),%ymm11 + 0xc4,0x62,0x7d,0x18,0x62,0x14, // vbroadcastss 0x14(%rdx),%ymm12 + 0xc4,0x62,0x7d,0x18,0x6a,0x20, // vbroadcastss 0x20(%rdx),%ymm13 + 0xc4,0x62,0x7d,0x18,0x52,0x2c, // vbroadcastss 0x2c(%rdx),%ymm10 + 0xc4,0x42,0x6d,0xb8,0xd5, // vfmadd231ps %ymm13,%ymm2,%ymm10 + 0xc4,0x42,0x75,0xb8,0xd4, // vfmadd231ps %ymm12,%ymm1,%ymm10 + 0xc4,0x42,0x7d,0xb8,0xd3, // vfmadd231ps %ymm11,%ymm0,%ymm10 + 0xc5,0x7c,0x29,0xc0, // vmovaps %ymm8,%ymm0 + 0xc5,0x7c,0x29,0xc9, // vmovaps %ymm9,%ymm1 + 0xc5,0x7c,0x29,0xd2, // vmovaps %ymm10,%ymm2 +}; #endif diff --git a/src/splicer/SkSplicer_stages.cpp b/src/splicer/SkSplicer_stages.cpp index c45f204e22..2d83996f7d 100644 --- a/src/splicer/SkSplicer_stages.cpp +++ b/src/splicer/SkSplicer_stages.cpp @@ -33,6 +33,7 @@ AI static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); } AI static U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); } + AI static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; } #elif defined(__ARM_NEON__) #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__) #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb. @@ -53,6 +54,7 @@ AI static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); } AI static U32 round(F v, F scale) { return vcvt_u32_f32(fma(v,scale,0.5f)); } + AI static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; } #else #if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__) #error On x86, compile with -mavx2 -mfma -mf16c. @@ -72,11 +74,20 @@ AI static F rsqrt(F v) { return _mm256_rsqrt_ps (v); } AI static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); } AI static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); } + + AI static F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, ix, 4); } #endif AI static F cast (U32 v) { return __builtin_convertvector((I32)v, F); } AI static U32 expand(U8 v) { return __builtin_convertvector( v, U32); } +template <typename T, typename P> +AI static T unaligned_load(const P* p) { + T v; + memcpy(&v, p, sizeof(v)); + return v; +} + // We'll be compiling this file to an object file, then extracting parts of it into // SkSplicer_generated.h. It's easier to do if the function names are not C++ mangled. // On ARMv7, use aapcs-vfp calling convention to pass as much data in registers as possible. @@ -241,8 +252,7 @@ STAGE(to_srgb) { STAGE(scale_u8) { auto ptr = *(const uint8_t**)ctx + x; - U8 scales; - memcpy(&scales, ptr, sizeof(scales)); + auto scales = unaligned_load<U8>(ptr); auto c = cast(expand(scales)) * k->_1_255; r = r * c; @@ -251,12 +261,24 @@ STAGE(scale_u8) { a = a * c; } +STAGE(load_tables) { + struct Ctx { + const uint32_t* src; + const float *r, *g, *b; + }; + auto c = (const Ctx*)ctx; + + auto px = unaligned_load<U32>(c->src + x); + r = gather(c->r, (px ) & k->_0x000000ff); + g = gather(c->g, (px >> 8) & k->_0x000000ff); + b = gather(c->b, (px >> 16) & k->_0x000000ff); + a = cast( (px >> 24)) * k->_1_255; +} + STAGE(load_8888) { auto ptr = *(const uint32_t**)ctx + x; - U32 px; - memcpy(&px, ptr, sizeof(px)); - + auto px = unaligned_load<U32>(ptr); r = cast((px ) & k->_0x000000ff) * k->_1_255; g = cast((px >> 8) & k->_0x000000ff) * k->_1_255; b = cast((px >> 16) & k->_0x000000ff) * k->_1_255; @@ -347,3 +369,14 @@ STAGE(store_f16) { _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567)); #endif } + +STAGE(matrix_3x4) { + auto m = (const float*)ctx; + + auto R = fma(r,m[0], fma(g,m[3], fma(b,m[6], m[ 9]))), + G = fma(r,m[1], fma(g,m[4], fma(b,m[7], m[10]))), + B = fma(r,m[2], fma(g,m[5], fma(b,m[8], m[11]))); + r = R; + g = G; + b = B; +} |