aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/splicer
diff options
context:
space:
mode:
authorGravatar Mike Klein <mtklein@chromium.org>2017-01-13 13:18:44 -0500
committerGravatar Skia Commit-Bot <skia-commit-bot@chromium.org>2017-01-13 20:30:25 +0000
commit69f98c738c48baf087b09e30f8d2c67471fc3103 (patch)
treef5b64875c982951767a023744512e318f615e920 /src/splicer
parent2873c767fdb6b5f785a88cf593bfee27d044fbb5 (diff)
SkSplicer: implement load_tables and matrix_3x4
These are enough to splice interesting SkColorSpaceXform pipelines. SkSplicer_stages.cpp is similar to but still intentionally distinct from SkRasterPipeline_opts. I hope to unify them next week. unaligned_load() is nothing tricky... just a little refactor. Change-Id: I05d0fc38dac985aa351d88776ecc14d2457f2124 Reviewed-on: https://skia-review.googlesource.com/7022 Reviewed-by: Matt Sarett <msarett@google.com> Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/splicer')
-rw-r--r--src/splicer/SkSplicer.cpp2
-rw-r--r--src/splicer/SkSplicer_generated.h204
-rw-r--r--src/splicer/SkSplicer_stages.cpp43
3 files changed, 244 insertions, 5 deletions
diff --git a/src/splicer/SkSplicer.cpp b/src/splicer/SkSplicer.cpp
index fcff0afbde..0295c4f2ae 100644
--- a/src/splicer/SkSplicer.cpp
+++ b/src/splicer/SkSplicer.cpp
@@ -293,10 +293,12 @@ namespace {
case SkRasterPipeline::from_srgb: splice(&buf, kSplice_from_srgb ); break;
case SkRasterPipeline::to_srgb: splice(&buf, kSplice_to_srgb ); break;
case SkRasterPipeline::scale_u8: splice(&buf, kSplice_scale_u8 ); break;
+ case SkRasterPipeline::load_tables: splice(&buf, kSplice_load_tables ); break;
case SkRasterPipeline::load_8888: splice(&buf, kSplice_load_8888 ); break;
case SkRasterPipeline::store_8888: splice(&buf, kSplice_store_8888 ); break;
case SkRasterPipeline::load_f16: splice(&buf, kSplice_load_f16 ); break;
case SkRasterPipeline::store_f16: splice(&buf, kSplice_store_f16 ); break;
+ case SkRasterPipeline::matrix_3x4: splice(&buf, kSplice_matrix_3x4 ); break;
// No joy (probably just not yet implemented).
default:
diff --git a/src/splicer/SkSplicer_generated.h b/src/splicer/SkSplicer_generated.h
index df4bd03331..228f8ff603 100644
--- a/src/splicer/SkSplicer_generated.h
+++ b/src/splicer/SkSplicer_generated.h
@@ -222,6 +222,58 @@ static const unsigned int kSplice_scale_u8[] = {
0x6e22de02, // fmul v2.4s, v16.4s, v2.4s
0x6e23de03, // fmul v3.4s, v16.4s, v3.4s
};
+static const unsigned int kSplice_load_tables[] = {
+ 0xa9402848, // ldp x8, x10, [x2]
+ 0xd37ef409, // lsl x9, x0, #2
+ 0x4d40c860, // ld1r {v0.4s}, [x3]
+ 0x3ce96903, // ldr q3, [x8,x9]
+ 0xa9412448, // ldp x8, x9, [x2,#16]
+ 0x4e231c01, // and v1.16b, v0.16b, v3.16b
+ 0x1e26002e, // fmov w14, s1
+ 0x6f380462, // ushr v2.4s, v3.4s, #8
+ 0x6f300470, // ushr v16.4s, v3.4s, #16
+ 0x8b2e494e, // add x14, x10, w14, uxtw #2
+ 0x0e0c3c2b, // mov w11, v1.s[1]
+ 0x0e143c2c, // mov w12, v1.s[2]
+ 0x0e1c3c2d, // mov w13, v1.s[3]
+ 0x4e221c01, // and v1.16b, v0.16b, v2.16b
+ 0x4e301c02, // and v2.16b, v0.16b, v16.16b
+ 0x0d4081c0, // ld1 {v0.s}[0], [x14]
+ 0x1e26002e, // fmov w14, s1
+ 0x8b2e490e, // add x14, x8, w14, uxtw #2
+ 0x8b2b494b, // add x11, x10, w11, uxtw #2
+ 0xbc6c5950, // ldr s16, [x10,w12,uxtw #2]
+ 0xbc6d5951, // ldr s17, [x10,w13,uxtw #2]
+ 0x0e0c3c2a, // mov w10, v1.s[1]
+ 0x0e143c2c, // mov w12, v1.s[2]
+ 0x0e1c3c2d, // mov w13, v1.s[3]
+ 0x0d4081c1, // ld1 {v1.s}[0], [x14]
+ 0x0d409160, // ld1 {v0.s}[1], [x11]
+ 0xbc6c5912, // ldr s18, [x8,w12,uxtw #2]
+ 0x0e143c4c, // mov w12, v2.s[2]
+ 0x1e26004e, // fmov w14, s2
+ 0xbc6c5933, // ldr s19, [x9,w12,uxtw #2]
+ 0x8b2e492c, // add x12, x9, w14, uxtw #2
+ 0x8b2a490a, // add x10, x8, w10, uxtw #2
+ 0x0e0c3c4f, // mov w15, v2.s[1]
+ 0x0e1c3c4b, // mov w11, v2.s[3]
+ 0x0d408182, // ld1 {v2.s}[0], [x12]
+ 0x0d409141, // ld1 {v1.s}[1], [x10]
+ 0x6e140600, // mov v0.s[2], v16.s[0]
+ 0xbc6d5910, // ldr s16, [x8,w13,uxtw #2]
+ 0x8b2f492a, // add x10, x9, w15, uxtw #2
+ 0x0d409142, // ld1 {v2.s}[1], [x10]
+ 0x6e140641, // mov v1.s[2], v18.s[0]
+ 0x6e1c0620, // mov v0.s[3], v17.s[0]
+ 0xbc6b5931, // ldr s17, [x9,w11,uxtw #2]
+ 0x6e1c0601, // mov v1.s[3], v16.s[0]
+ 0xbd400c70, // ldr s16, [x3,#12]
+ 0x6f280463, // ushr v3.4s, v3.4s, #24
+ 0x6e140662, // mov v2.s[2], v19.s[0]
+ 0x4e21d863, // scvtf v3.4s, v3.4s
+ 0x6e1c0622, // mov v2.s[3], v17.s[0]
+ 0x4f909063, // fmul v3.4s, v3.4s, v16.s[0]
+};
static const unsigned int kSplice_load_8888[] = {
0xf9400048, // ldr x8, [x2]
0xd37ef409, // lsl x9, x0, #2
@@ -281,6 +333,33 @@ static const unsigned int kSplice_store_f16[] = {
0x0e216873, // fcvtn v19.4h, v3.4s
0x0c000510, // st4 {v16.4h-v19.4h}, [x8]
};
+static const unsigned int kSplice_matrix_3x4[] = {
+ 0xaa0203e8, // mov x8, x2
+ 0x91009049, // add x9, x2, #0x24
+ 0x4ddfc913, // ld1r {v19.4s}, [x8], #4
+ 0x4d40c930, // ld1r {v16.4s}, [x9]
+ 0x9100a049, // add x9, x2, #0x28
+ 0x4d40c931, // ld1r {v17.4s}, [x9]
+ 0x2d435454, // ldp s20, s21, [x2,#24]
+ 0x9100b049, // add x9, x2, #0x2c
+ 0xbd402056, // ldr s22, [x2,#32]
+ 0x4d40c932, // ld1r {v18.4s}, [x9]
+ 0x4f941050, // fmla v16.4s, v2.4s, v20.s[0]
+ 0x4f951051, // fmla v17.4s, v2.4s, v21.s[0]
+ 0x2d415454, // ldp s20, s21, [x2,#8]
+ 0x4f961052, // fmla v18.4s, v2.4s, v22.s[0]
+ 0x2d425842, // ldp s2, s22, [x2,#16]
+ 0x4f951030, // fmla v16.4s, v1.4s, v21.s[0]
+ 0xbd400115, // ldr s21, [x8]
+ 0x4f821031, // fmla v17.4s, v1.4s, v2.s[0]
+ 0x4f961032, // fmla v18.4s, v1.4s, v22.s[0]
+ 0x4e20ce70, // fmla v16.4s, v19.4s, v0.4s
+ 0x4f951011, // fmla v17.4s, v0.4s, v21.s[0]
+ 0x4f941012, // fmla v18.4s, v0.4s, v20.s[0]
+ 0x4eb01e00, // mov v0.16b, v16.16b
+ 0x4eb11e21, // mov v1.16b, v17.16b
+ 0x4eb21e42, // mov v2.16b, v18.16b
+};
#elif defined(__ARM_NEON__)
@@ -505,6 +584,44 @@ static const unsigned int kSplice_scale_u8[] = {
0xe28dd008, // add sp, sp, #8
0xecbd8b02, // vpop {d8}
};
+static const unsigned int kSplice_load_tables[] = {
+ 0xe92d41f0, // push {r4, r5, r6, r7, r8, lr}
+ 0xe592c000, // ldr ip, [r2]
+ 0xf4e30c9f, // vld1.32 {d16[]}, [r3 :32]
+ 0xe08c5100, // add r5, ip, r0, lsl #2
+ 0xe592e004, // ldr lr, [r2, #4]
+ 0xedd51b00, // vldr d17, [r5]
+ 0xf24021b1, // vand d18, d16, d17
+ 0xe592800c, // ldr r8, [r2, #12]
+ 0xf3f83031, // vshr.u32 d19, d17, #8
+ 0xe5924008, // ldr r4, [r2, #8]
+ 0xed931a03, // vldr s2, [r3, #12]
+ 0xee325b90, // vmov.32 r5, d18[1]
+ 0xee126b90, // vmov.32 r6, d18[0]
+ 0xf3f02031, // vshr.u32 d18, d17, #16
+ 0xf24021b2, // vand d18, d16, d18
+ 0xf24001b3, // vand d16, d16, d19
+ 0xee127b90, // vmov.32 r7, d18[0]
+ 0xe08e5105, // add r5, lr, r5, lsl #2
+ 0xe08e6106, // add r6, lr, r6, lsl #2
+ 0xedd50a00, // vldr s1, [r5]
+ 0xee325b90, // vmov.32 r5, d18[1]
+ 0xed960a00, // vldr s0, [r6]
+ 0xee306b90, // vmov.32 r6, d16[1]
+ 0xe0887107, // add r7, r8, r7, lsl #2
+ 0xe088c105, // add ip, r8, r5, lsl #2
+ 0xee105b90, // vmov.32 r5, d16[0]
+ 0xf3e80031, // vshr.u32 d16, d17, #24
+ 0xe0846106, // add r6, r4, r6, lsl #2
+ 0xeddc2a00, // vldr s5, [ip]
+ 0xf3fb0620, // vcvt.f32.s32 d16, d16
+ 0xed972a00, // vldr s4, [r7]
+ 0xf2a039c1, // vmul.f32 d3, d16, d1[0]
+ 0xedd61a00, // vldr s3, [r6]
+ 0xe0846105, // add r6, r4, r5, lsl #2
+ 0xed961a00, // vldr s2, [r6]
+ 0xe8bd41f0, // pop {r4, r5, r6, r7, r8, lr}
+};
static const unsigned int kSplice_load_8888[] = {
0xe592c000, // ldr ip, [r2]
0xf4e30c9f, // vld1.32 {d16[]}, [r3 :32]
@@ -581,6 +698,43 @@ static const unsigned int kSplice_store_f16[] = {
0xe08cc180, // add ip, ip, r0, lsl #3
0xf44c084f, // vst2.16 {d16-d17}, [ip]
};
+static const unsigned int kSplice_matrix_3x4[] = {
+ 0xe282c020, // add ip, r2, #32
+ 0xf4ec3c9f, // vld1.32 {d19[]}, [ip :32]
+ 0xe282c02c, // add ip, r2, #44
+ 0xf4ec0c9f, // vld1.32 {d16[]}, [ip :32]
+ 0xe282c01c, // add ip, r2, #28
+ 0xf2420c33, // vfma.f32 d16, d2, d19
+ 0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32]
+ 0xe282c018, // add ip, r2, #24
+ 0xf4ec2c9f, // vld1.32 {d18[]}, [ip :32]
+ 0xe282c024, // add ip, r2, #36
+ 0xf4ec1c9f, // vld1.32 {d17[]}, [ip :32]
+ 0xe282c028, // add ip, r2, #40
+ 0xf2421c32, // vfma.f32 d17, d2, d18
+ 0xf4ec2c9f, // vld1.32 {d18[]}, [ip :32]
+ 0xe282c010, // add ip, r2, #16
+ 0xf2422c34, // vfma.f32 d18, d2, d20
+ 0xf4ec3c9f, // vld1.32 {d19[]}, [ip :32]
+ 0xe282c00c, // add ip, r2, #12
+ 0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32]
+ 0xe282c014, // add ip, r2, #20
+ 0xf2411c34, // vfma.f32 d17, d1, d20
+ 0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32]
+ 0xf2410c34, // vfma.f32 d16, d1, d20
+ 0xe282c004, // add ip, r2, #4
+ 0xf2412c33, // vfma.f32 d18, d1, d19
+ 0xf4e23c9f, // vld1.32 {d19[]}, [r2 :32]
+ 0xf4ec4c9f, // vld1.32 {d20[]}, [ip :32]
+ 0xe282c008, // add ip, r2, #8
+ 0xf2401c33, // vfma.f32 d17, d0, d19
+ 0xf4ec3c9f, // vld1.32 {d19[]}, [ip :32]
+ 0xf2400c33, // vfma.f32 d16, d0, d19
+ 0xf2402c34, // vfma.f32 d18, d0, d20
+ 0xf22101b1, // vorr d0, d17, d17
+ 0xf22021b0, // vorr d2, d16, d16
+ 0xf22211b2, // vorr d1, d18, d18
+};
#else
@@ -747,6 +901,30 @@ static const unsigned char kSplice_scale_u8[] = {
0xc5,0xbc,0x59,0xd2, // vmulps %ymm2,%ymm8,%ymm2
0xc5,0xbc,0x59,0xdb, // vmulps %ymm3,%ymm8,%ymm3
};
+static const unsigned char kSplice_load_tables[] = {
+ 0x48,0x8b,0x02, // mov (%rdx),%rax
+ 0x4c,0x8b,0x42,0x08, // mov 0x8(%rdx),%r8
+ 0xc5,0xfc,0x10,0x1c,0xb8, // vmovups (%rax,%rdi,4),%ymm3
+ 0xc4,0xe2,0x7d,0x18,0x11, // vbroadcastss (%rcx),%ymm2
+ 0xc5,0xec,0x54,0xcb, // vandps %ymm3,%ymm2,%ymm1
+ 0xc5,0xfc,0x57,0xc0, // vxorps %ymm0,%ymm0,%ymm0
+ 0xc5,0x7c,0xc2,0xc0,0x00, // vcmpeqps %ymm0,%ymm0,%ymm8
+ 0xc4,0x41,0x7c,0x28,0xc8, // vmovaps %ymm8,%ymm9
+ 0xc4,0xc2,0x35,0x92,0x04,0x88, // vgatherdps %ymm9,(%r8,%ymm1,4),%ymm0
+ 0x48,0x8b,0x42,0x10, // mov 0x10(%rdx),%rax
+ 0xc5,0xf5,0x72,0xd3,0x08, // vpsrld $0x8,%ymm3,%ymm1
+ 0xc5,0x6c,0x54,0xc9, // vandps %ymm1,%ymm2,%ymm9
+ 0xc4,0x41,0x7c,0x28,0xd0, // vmovaps %ymm8,%ymm10
+ 0xc4,0xa2,0x2d,0x92,0x0c,0x88, // vgatherdps %ymm10,(%rax,%ymm9,4),%ymm1
+ 0x48,0x8b,0x42,0x18, // mov 0x18(%rdx),%rax
+ 0xc5,0xb5,0x72,0xd3,0x10, // vpsrld $0x10,%ymm3,%ymm9
+ 0xc4,0x41,0x6c,0x54,0xc9, // vandps %ymm9,%ymm2,%ymm9
+ 0xc4,0xa2,0x3d,0x92,0x14,0x88, // vgatherdps %ymm8,(%rax,%ymm9,4),%ymm2
+ 0xc5,0xe5,0x72,0xd3,0x18, // vpsrld $0x18,%ymm3,%ymm3
+ 0xc5,0xfc,0x5b,0xdb, // vcvtdq2ps %ymm3,%ymm3
+ 0xc4,0x62,0x7d,0x18,0x41,0x0c, // vbroadcastss 0xc(%rcx),%ymm8
+ 0xc4,0xc1,0x64,0x59,0xd8, // vmulps %ymm8,%ymm3,%ymm3
+};
static const unsigned char kSplice_load_8888[] = {
0x48,0x8b,0x02, // mov (%rdx),%rax
0xc5,0xfc,0x10,0x1c,0xb8, // vmovups (%rax,%rdi,4),%ymm3
@@ -828,6 +1006,32 @@ static const unsigned char kSplice_store_f16[] = {
0xc4,0x41,0x39,0x6a,0xc2, // vpunpckhdq %xmm10,%xmm8,%xmm8
0xc5,0x7a,0x7f,0x44,0xf8,0x30, // vmovdqu %xmm8,0x30(%rax,%rdi,8)
};
+static const unsigned char kSplice_matrix_3x4[] = {
+ 0xc4,0x62,0x7d,0x18,0x0a, // vbroadcastss (%rdx),%ymm9
+ 0xc4,0x62,0x7d,0x18,0x52,0x0c, // vbroadcastss 0xc(%rdx),%ymm10
+ 0xc4,0x62,0x7d,0x18,0x5a,0x18, // vbroadcastss 0x18(%rdx),%ymm11
+ 0xc4,0x62,0x7d,0x18,0x42,0x24, // vbroadcastss 0x24(%rdx),%ymm8
+ 0xc4,0x42,0x6d,0xb8,0xc3, // vfmadd231ps %ymm11,%ymm2,%ymm8
+ 0xc4,0x42,0x75,0xb8,0xc2, // vfmadd231ps %ymm10,%ymm1,%ymm8
+ 0xc4,0x42,0x7d,0xb8,0xc1, // vfmadd231ps %ymm9,%ymm0,%ymm8
+ 0xc4,0x62,0x7d,0x18,0x52,0x04, // vbroadcastss 0x4(%rdx),%ymm10
+ 0xc4,0x62,0x7d,0x18,0x5a,0x10, // vbroadcastss 0x10(%rdx),%ymm11
+ 0xc4,0x62,0x7d,0x18,0x62,0x1c, // vbroadcastss 0x1c(%rdx),%ymm12
+ 0xc4,0x62,0x7d,0x18,0x4a,0x28, // vbroadcastss 0x28(%rdx),%ymm9
+ 0xc4,0x42,0x6d,0xb8,0xcc, // vfmadd231ps %ymm12,%ymm2,%ymm9
+ 0xc4,0x42,0x75,0xb8,0xcb, // vfmadd231ps %ymm11,%ymm1,%ymm9
+ 0xc4,0x42,0x7d,0xb8,0xca, // vfmadd231ps %ymm10,%ymm0,%ymm9
+ 0xc4,0x62,0x7d,0x18,0x5a,0x08, // vbroadcastss 0x8(%rdx),%ymm11
+ 0xc4,0x62,0x7d,0x18,0x62,0x14, // vbroadcastss 0x14(%rdx),%ymm12
+ 0xc4,0x62,0x7d,0x18,0x6a,0x20, // vbroadcastss 0x20(%rdx),%ymm13
+ 0xc4,0x62,0x7d,0x18,0x52,0x2c, // vbroadcastss 0x2c(%rdx),%ymm10
+ 0xc4,0x42,0x6d,0xb8,0xd5, // vfmadd231ps %ymm13,%ymm2,%ymm10
+ 0xc4,0x42,0x75,0xb8,0xd4, // vfmadd231ps %ymm12,%ymm1,%ymm10
+ 0xc4,0x42,0x7d,0xb8,0xd3, // vfmadd231ps %ymm11,%ymm0,%ymm10
+ 0xc5,0x7c,0x29,0xc0, // vmovaps %ymm8,%ymm0
+ 0xc5,0x7c,0x29,0xc9, // vmovaps %ymm9,%ymm1
+ 0xc5,0x7c,0x29,0xd2, // vmovaps %ymm10,%ymm2
+};
#endif
diff --git a/src/splicer/SkSplicer_stages.cpp b/src/splicer/SkSplicer_stages.cpp
index c45f204e22..2d83996f7d 100644
--- a/src/splicer/SkSplicer_stages.cpp
+++ b/src/splicer/SkSplicer_stages.cpp
@@ -33,6 +33,7 @@
AI static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
AI static U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
+ AI static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
#elif defined(__ARM_NEON__)
#if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
#error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb.
@@ -53,6 +54,7 @@
AI static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }
AI static U32 round(F v, F scale) { return vcvt_u32_f32(fma(v,scale,0.5f)); }
+ AI static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
#else
#if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
#error On x86, compile with -mavx2 -mfma -mf16c.
@@ -72,11 +74,20 @@
AI static F rsqrt(F v) { return _mm256_rsqrt_ps (v); }
AI static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
AI static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
+
+ AI static F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, ix, 4); }
#endif
AI static F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
AI static U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
+template <typename T, typename P>
+AI static T unaligned_load(const P* p) {
+ T v;
+ memcpy(&v, p, sizeof(v));
+ return v;
+}
+
// We'll be compiling this file to an object file, then extracting parts of it into
// SkSplicer_generated.h. It's easier to do if the function names are not C++ mangled.
// On ARMv7, use aapcs-vfp calling convention to pass as much data in registers as possible.
@@ -241,8 +252,7 @@ STAGE(to_srgb) {
STAGE(scale_u8) {
auto ptr = *(const uint8_t**)ctx + x;
- U8 scales;
- memcpy(&scales, ptr, sizeof(scales));
+ auto scales = unaligned_load<U8>(ptr);
auto c = cast(expand(scales)) * k->_1_255;
r = r * c;
@@ -251,12 +261,24 @@ STAGE(scale_u8) {
a = a * c;
}
+STAGE(load_tables) {
+ struct Ctx {
+ const uint32_t* src;
+ const float *r, *g, *b;
+ };
+ auto c = (const Ctx*)ctx;
+
+ auto px = unaligned_load<U32>(c->src + x);
+ r = gather(c->r, (px ) & k->_0x000000ff);
+ g = gather(c->g, (px >> 8) & k->_0x000000ff);
+ b = gather(c->b, (px >> 16) & k->_0x000000ff);
+ a = cast( (px >> 24)) * k->_1_255;
+}
+
STAGE(load_8888) {
auto ptr = *(const uint32_t**)ctx + x;
- U32 px;
- memcpy(&px, ptr, sizeof(px));
-
+ auto px = unaligned_load<U32>(ptr);
r = cast((px ) & k->_0x000000ff) * k->_1_255;
g = cast((px >> 8) & k->_0x000000ff) * k->_1_255;
b = cast((px >> 16) & k->_0x000000ff) * k->_1_255;
@@ -347,3 +369,14 @@ STAGE(store_f16) {
_mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
#endif
}
+
+STAGE(matrix_3x4) {
+ auto m = (const float*)ctx;
+
+ auto R = fma(r,m[0], fma(g,m[3], fma(b,m[6], m[ 9]))),
+ G = fma(r,m[1], fma(g,m[4], fma(b,m[7], m[10]))),
+ B = fma(r,m[2], fma(g,m[5], fma(b,m[8], m[11])));
+ r = R;
+ g = G;
+ b = B;
+}