aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author: Mike Klein <mtklein@chromium.org> 2017-03-07 12:28:11 -0500
committer: Skia Commit-Bot <skia-commit-bot@chromium.org> 2017-03-07 18:08:46 +0000
commit: e9ed07de16de59959d18ab56a8f62b66f210955d (patch)
tree: c1004747655f7748edd9b1ef49add628b265b568
parent: c3cc5fa6de0a8237d9241dbf3e6c0786a9040069 (diff)
SkJumper: a couple stages used by color filters
Nothing too interesting here.
  - 4x5 matrix is the same as 2x3 and 3x4, but bigger.
  - luminance to alpha is pretty boring indeed.

Change-Id: I24893df15d1e22cbb500d946e1bbb58aa1522c8c
Reviewed-on: https://skia-review.googlesource.com/9385
Reviewed-by: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
-rw-r--r-- src/jumper/SkJumper.cpp 4
-rw-r--r-- src/jumper/SkJumper.h 5
-rw-r--r-- src/jumper/SkJumper_generated.cpp 822
-rw-r--r-- src/jumper/SkJumper_stages.cpp 17
4 files changed, 848 insertions, 0 deletions
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index ce55b87e38..caa9db2fd3 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -5,6 +5,7 @@
* found in the LICENSE file.
*/
+#include "SkColorPriv.h"
#include "SkCpu.h"
#include "SkJumper.h"
#include "SkRasterPipeline.h"
@@ -44,6 +45,7 @@ static K kConstants = {
0x0000f800, 0x000007e0, 0x0000001f, // 565
1.0f/0x0000f800, 1.0f/0x000007e0, 1.0f/0x0000001f,
31.0f, 63.0f,
+ SK_LUM_COEFF_R, SK_LUM_COEFF_G, SK_LUM_COEFF_B, // luminance -> alpha
};
#define STAGES(M) \
@@ -80,8 +82,10 @@ static K kConstants = {
M(load_f16) \
M(store_f16) \
M(store_f32) \
+ M(luminance_to_alpha) \
M(matrix_2x3) \
M(matrix_3x4) \
+ M(matrix_4x5) \
M(matrix_perspective) \
M(clamp_x) \
M(clamp_y) \
diff --git a/src/jumper/SkJumper.h b/src/jumper/SkJumper.h
index 312110a06d..215284b84c 100644
--- a/src/jumper/SkJumper.h
+++ b/src/jumper/SkJumper.h
@@ -57,6 +57,11 @@ struct SkJumper_constants {
float b_565_scale;
float _31;
float _63;
+
+ // luminance -> alpha
+ float lum_r;
+ float lum_g;
+ float lum_b;
};
#endif//SkJumper_DEFINED
diff --git a/src/jumper/SkJumper_generated.cpp b/src/jumper/SkJumper_generated.cpp
index 234cbc7047..ce7fc44945 100644
--- a/src/jumper/SkJumper_generated.cpp
+++ b/src/jumper/SkJumper_generated.cpp
@@ -784,6 +784,19 @@ CODE const uint32_t sk_mirror_y_aarch64[] = {
0xd61f0060, //br x3
};
+CODE const uint32_t sk_luminance_to_alpha_aarch64[] = {
+ 0x2d510c50, //ldp s16, s3, [x2, #136]
+ 0xbd409051, //ldr s17, [x2, #144]
+ 0xf8408423, //ldr x3, [x1], #8
+ 0x4f839023, //fmul v3.4s, v1.4s, v3.s[0]
+ 0x4f901003, //fmla v3.4s, v0.4s, v16.s[0]
+ 0x6f00e400, //movi v0.2d, #0x0
+ 0x6f00e401, //movi v1.2d, #0x0
+ 0x4f911043, //fmla v3.4s, v2.4s, v17.s[0]
+ 0x6f00e402, //movi v2.2d, #0x0
+ 0xd61f0060, //br x3
+};
+
CODE const uint32_t sk_matrix_2x3_aarch64[] = {
0xa8c10c28, //ldp x8, x3, [x1], #16
0xaa0803e9, //mov x9, x8
@@ -833,6 +846,51 @@ CODE const uint32_t sk_matrix_3x4_aarch64[] = {
0xd61f0060, //br x3
};
+CODE const uint32_t sk_matrix_4x5_aarch64[] = {
+ 0xf9400029, //ldr x9, [x1]
+ 0xaa0903e8, //mov x8, x9
+ 0x9101012a, //add x10, x9, #0x40
+ 0x4ddfc914, //ld1r {v20.4s}, [x8], #4
+ 0x4d40c950, //ld1r {v16.4s}, [x10]
+ 0x9101112a, //add x10, x9, #0x44
+ 0x4d40c951, //ld1r {v17.4s}, [x10]
+ 0x9101212a, //add x10, x9, #0x48
+ 0x4d40c952, //ld1r {v18.4s}, [x10]
+ 0x2d465533, //ldp s19, s21, [x9, #48]
+ 0x2d475d36, //ldp s22, s23, [x9, #56]
+ 0x9101312a, //add x10, x9, #0x4c
+ 0xf9400423, //ldr x3, [x1, #8]
+ 0x4f931070, //fmla v16.4s, v3.4s, v19.s[0]
+ 0x4d40c953, //ld1r {v19.4s}, [x10]
+ 0x4f951071, //fmla v17.4s, v3.4s, v21.s[0]
+ 0x4f961072, //fmla v18.4s, v3.4s, v22.s[0]
+ 0x2d445935, //ldp s21, s22, [x9, #32]
+ 0x4f971073, //fmla v19.4s, v3.4s, v23.s[0]
+ 0x2d455d23, //ldp s3, s23, [x9, #40]
+ 0x91004021, //add x1, x1, #0x10
+ 0x4f951050, //fmla v16.4s, v2.4s, v21.s[0]
+ 0x4f961051, //fmla v17.4s, v2.4s, v22.s[0]
+ 0x2d425935, //ldp s21, s22, [x9, #16]
+ 0x4f971053, //fmla v19.4s, v2.4s, v23.s[0]
+ 0x4f831052, //fmla v18.4s, v2.4s, v3.s[0]
+ 0x2d410d22, //ldp s2, s3, [x9, #8]
+ 0x4f951030, //fmla v16.4s, v1.4s, v21.s[0]
+ 0x2d435d35, //ldp s21, s23, [x9, #24]
+ 0x4f961031, //fmla v17.4s, v1.4s, v22.s[0]
+ 0xbd400116, //ldr s22, [x8]
+ 0x4e20ce90, //fmla v16.4s, v20.4s, v0.4s
+ 0x4f951032, //fmla v18.4s, v1.4s, v21.s[0]
+ 0x4f971033, //fmla v19.4s, v1.4s, v23.s[0]
+ 0x4f821012, //fmla v18.4s, v0.4s, v2.s[0]
+ 0x4f831013, //fmla v19.4s, v0.4s, v3.s[0]
+ 0x4f961011, //fmla v17.4s, v0.4s, v22.s[0]
+ 0x4eb01e00, //mov v0.16b, v16.16b
+ 0x4eb11e21, //mov v1.16b, v17.16b
+ 0x4eb21e42, //mov v2.16b, v18.16b
+ 0x4eb31e63, //mov v3.16b, v19.16b
+ 0xd61f0060, //br x3
+};
+
CODE const uint32_t sk_matrix_perspective_aarch64[] = {
0xa8c10c28, //ldp x8, x3, [x1], #16
0xaa0803e9, //mov x9, x8
@@ -1723,6 +1781,24 @@ CODE const uint32_t sk_mirror_y_vfp4[] = {
0xe12fff1c, //bx ip
};
+CODE const uint32_t sk_luminance_to_alpha_vfp4[] = {
+ 0xed2d8b02, //vpush {d8}
+ 0xed923a22, //vldr s6, [r2, #136]
+ 0xe2823090, //add r3, r2, #144
+ 0xed928a23, //vldr s16, [r2, #140]
+ 0xf2e01943, //vmul.f32 d17, d0, d3[0]
+ 0xf2e10948, //vmul.f32 d16, d1, d8[0]
+ 0xf2800010, //vmov.i32 d0, #0
+ 0xf2801010, //vmov.i32 d1, #0
+ 0xf2013da0, //vadd.f32 d3, d17, d16
+ 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32]
+ 0xf2003c92, //vfma.f32 d3, d16, d2
+ 0xe4913004, //ldr r3, [r1], #4
+ 0xf2802010, //vmov.i32 d2, #0
+ 0xecbd8b02, //vpop {d8}
+ 0xe12fff13, //bx r3
+};
+
CODE const uint32_t sk_matrix_2x3_vfp4[] = {
0xe92d4800, //push {fp, lr}
0xe591e000, //ldr lr, [r1]
@@ -1793,6 +1869,74 @@ CODE const uint32_t sk_matrix_3x4_vfp4[] = {
0xe12fff1c, //bx ip
};
+CODE const uint32_t sk_matrix_4x5_vfp4[] = {
+ 0xe92d4800, //push {fp, lr}
+ 0xe591e000, //ldr lr, [r1]
+ 0xf2630113, //vorr d16, d3, d3
+ 0xf2621112, //vorr d17, d2, d2
+ 0xe591c004, //ldr ip, [r1, #4]
+ 0xe28e301c, //add r3, lr, #28
+ 0xe2811008, //add r1, r1, #8
+ 0xf4ee4c9f, //vld1.32 {d20[]}, [lr :32]
+ 0xf4e35c9f, //vld1.32 {d21[]}, [r3 :32]
+ 0xe28e302c, //add r3, lr, #44
+ 0xf4e36c9f, //vld1.32 {d22[]}, [r3 :32]
+ 0xe28e303c, //add r3, lr, #60
+ 0xf4e37c9f, //vld1.32 {d23[]}, [r3 :32]
+ 0xe28e304c, //add r3, lr, #76
+ 0xf4a33c9f, //vld1.32 {d3[]}, [r3 :32]
+ 0xe28e3038, //add r3, lr, #56
+ 0xf2003cb7, //vfma.f32 d3, d16, d23
+ 0xf4e38c9f, //vld1.32 {d24[]}, [r3 :32]
+ 0xe28e3048, //add r3, lr, #72
+ 0xf4a32c9f, //vld1.32 {d2[]}, [r3 :32]
+ 0xe28e3034, //add r3, lr, #52
+ 0xf2002cb8, //vfma.f32 d2, d16, d24
+ 0xf4e39c9f, //vld1.32 {d25[]}, [r3 :32]
+ 0xe28e3030, //add r3, lr, #48
+ 0xf4e33c9f, //vld1.32 {d19[]}, [r3 :32]
+ 0xe28e3040, //add r3, lr, #64
+ 0xf2013cb6, //vfma.f32 d3, d17, d22
+ 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32]
+ 0xe28e3044, //add r3, lr, #68
+ 0xf2402cb3, //vfma.f32 d18, d16, d19
+ 0xf4e33c9f, //vld1.32 {d19[]}, [r3 :32]
+ 0xe28e3024, //add r3, lr, #36
+ 0xf2403cb9, //vfma.f32 d19, d16, d25
+ 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32]
+ 0xe28e3020, //add r3, lr, #32
+ 0xf4e37c9f, //vld1.32 {d23[]}, [r3 :32]
+ 0xe28e3028, //add r3, lr, #40
+ 0xf2013c35, //vfma.f32 d3, d1, d21
+ 0xf2412cb7, //vfma.f32 d18, d17, d23
+ 0xf4e37c9f, //vld1.32 {d23[]}, [r3 :32]
+ 0xe28e3014, //add r3, lr, #20
+ 0xf2012cb7, //vfma.f32 d2, d17, d23
+ 0xf2413cb0, //vfma.f32 d19, d17, d16
+ 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32]
+ 0xe28e3010, //add r3, lr, #16
+ 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32]
+ 0xe28e3018, //add r3, lr, #24
+ 0xf2412c31, //vfma.f32 d18, d1, d17
+ 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32]
+ 0xe28e3008, //add r3, lr, #8
+ 0xf2012c31, //vfma.f32 d2, d1, d17
+ 0xf2413c30, //vfma.f32 d19, d1, d16
+ 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32]
+ 0xe28e3004, //add r3, lr, #4
+ 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32]
+ 0xe28e300c, //add r3, lr, #12
+ 0xf2402c34, //vfma.f32 d18, d0, d20
+ 0xf2002c30, //vfma.f32 d2, d0, d16
+ 0xf2403c31, //vfma.f32 d19, d0, d17
+ 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32]
+ 0xf2003c31, //vfma.f32 d3, d0, d17
+ 0xf22201b2, //vorr d0, d18, d18
+ 0xf22311b3, //vorr d1, d19, d19
+ 0xe8bd4800, //pop {fp, lr}
+ 0xe12fff1c, //bx ip
+};
+
CODE const uint32_t sk_matrix_perspective_vfp4[] = {
0xe92d4800, //push {fp, lr}
0xe591e000, //ldr lr, [r1]
@@ -2911,6 +3055,20 @@ CODE const uint8_t sk_mirror_y_hsw[] = {
255,224, //jmpq *%rax
};
+CODE const uint8_t sk_luminance_to_alpha_hsw[] = {
+ 196,98,125,24,130,136,0,0,0, //vbroadcastss 0x88(%rdx),%ymm8
+ 196,226,125,24,154,140,0,0,0, //vbroadcastss 0x8c(%rdx),%ymm3
+ 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1
+ 196,98,125,168,193, //vfmadd213ps %ymm1,%ymm0,%ymm8
+ 196,226,125,24,154,144,0,0,0, //vbroadcastss 0x90(%rdx),%ymm3
+ 196,194,109,168,216, //vfmadd213ps %ymm8,%ymm2,%ymm3
+ 72,173, //lods %ds:(%rsi),%rax
+ 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0
+ 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
+ 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2
+ 255,224, //jmpq *%rax
+};
+
CODE const uint8_t sk_matrix_2x3_hsw[] = {
72,173, //lods %ds:(%rsi),%rax
196,98,125,24,8, //vbroadcastss (%rax),%ymm9
@@ -2959,6 +3117,52 @@ CODE const uint8_t sk_matrix_3x4_hsw[] = {
255,224, //jmpq *%rax
};
+CODE const uint8_t sk_matrix_4x5_hsw[] = {
+ 72,173, //lods %ds:(%rsi),%rax
+ 196,98,125,24,8, //vbroadcastss (%rax),%ymm9
+ 196,98,125,24,80,16, //vbroadcastss 0x10(%rax),%ymm10
+ 196,98,125,24,88,32, //vbroadcastss 0x20(%rax),%ymm11
+ 196,98,125,24,96,48, //vbroadcastss 0x30(%rax),%ymm12
+ 196,98,125,24,64,64, //vbroadcastss 0x40(%rax),%ymm8
+ 196,66,101,184,196, //vfmadd231ps %ymm12,%ymm3,%ymm8
+ 196,66,109,184,195, //vfmadd231ps %ymm11,%ymm2,%ymm8
+ 196,66,117,184,194, //vfmadd231ps %ymm10,%ymm1,%ymm8
+ 196,66,125,184,193, //vfmadd231ps %ymm9,%ymm0,%ymm8
+ 196,98,125,24,80,4, //vbroadcastss 0x4(%rax),%ymm10
+ 196,98,125,24,88,20, //vbroadcastss 0x14(%rax),%ymm11
+ 196,98,125,24,96,36, //vbroadcastss 0x24(%rax),%ymm12
+ 196,98,125,24,104,52, //vbroadcastss 0x34(%rax),%ymm13
+ 196,98,125,24,72,68, //vbroadcastss 0x44(%rax),%ymm9
+ 196,66,101,184,205, //vfmadd231ps %ymm13,%ymm3,%ymm9
+ 196,66,109,184,204, //vfmadd231ps %ymm12,%ymm2,%ymm9
+ 196,66,117,184,203, //vfmadd231ps %ymm11,%ymm1,%ymm9
+ 196,66,125,184,202, //vfmadd231ps %ymm10,%ymm0,%ymm9
+ 196,98,125,24,88,8, //vbroadcastss 0x8(%rax),%ymm11
+ 196,98,125,24,96,24, //vbroadcastss 0x18(%rax),%ymm12
+ 196,98,125,24,104,40, //vbroadcastss 0x28(%rax),%ymm13
+ 196,98,125,24,112,56, //vbroadcastss 0x38(%rax),%ymm14
+ 196,98,125,24,80,72, //vbroadcastss 0x48(%rax),%ymm10
+ 196,66,101,184,214, //vfmadd231ps %ymm14,%ymm3,%ymm10
+ 196,66,109,184,213, //vfmadd231ps %ymm13,%ymm2,%ymm10
+ 196,66,117,184,212, //vfmadd231ps %ymm12,%ymm1,%ymm10
+ 196,66,125,184,211, //vfmadd231ps %ymm11,%ymm0,%ymm10
+ 196,98,125,24,96,12, //vbroadcastss 0xc(%rax),%ymm12
+ 196,98,125,24,104,28, //vbroadcastss 0x1c(%rax),%ymm13
+ 196,98,125,24,112,44, //vbroadcastss 0x2c(%rax),%ymm14
+ 196,98,125,24,120,60, //vbroadcastss 0x3c(%rax),%ymm15
+ 196,98,125,24,88,76, //vbroadcastss 0x4c(%rax),%ymm11
+ 196,66,101,184,223, //vfmadd231ps %ymm15,%ymm3,%ymm11
+ 196,66,109,184,222, //vfmadd231ps %ymm14,%ymm2,%ymm11
+ 196,66,117,184,221, //vfmadd231ps %ymm13,%ymm1,%ymm11
+ 196,66,125,184,220, //vfmadd231ps %ymm12,%ymm0,%ymm11
+ 72,173, //lods %ds:(%rsi),%rax
+ 197,124,41,192, //vmovaps %ymm8,%ymm0
+ 197,124,41,201, //vmovaps %ymm9,%ymm1
+ 197,124,41,210, //vmovaps %ymm10,%ymm2
+ 197,124,41,219, //vmovaps %ymm11,%ymm3
+ 255,224, //jmpq *%rax
+};
+
CODE const uint8_t sk_matrix_perspective_hsw[] = {
72,173, //lods %ds:(%rsi),%rax
196,98,125,24,0, //vbroadcastss (%rax),%ymm8
@@ -4395,6 +4599,22 @@ CODE const uint8_t sk_mirror_y_avx[] = {
255,224, //jmpq *%rax
};
+CODE const uint8_t sk_luminance_to_alpha_avx[] = {
+ 196,226,125,24,154,136,0,0,0, //vbroadcastss 0x88(%rdx),%ymm3
+ 197,228,89,192, //vmulps %ymm0,%ymm3,%ymm0
+ 196,226,125,24,154,140,0,0,0, //vbroadcastss 0x8c(%rdx),%ymm3
+ 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1
+ 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0
+ 196,226,125,24,138,144,0,0,0, //vbroadcastss 0x90(%rdx),%ymm1
+ 197,244,89,202, //vmulps %ymm2,%ymm1,%ymm1
+ 197,252,88,217, //vaddps %ymm1,%ymm0,%ymm3
+ 72,173, //lods %ds:(%rsi),%rax
+ 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0
+ 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
+ 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2
+ 255,224, //jmpq *%rax
+};
+
CODE const uint8_t sk_matrix_2x3_avx[] = {
72,173, //lods %ds:(%rsi),%rax
196,98,125,24,0, //vbroadcastss (%rax),%ymm8
@@ -4454,6 +4674,67 @@ CODE const uint8_t sk_matrix_3x4_avx[] = {
255,224, //jmpq *%rax
};
+CODE const uint8_t sk_matrix_4x5_avx[] = {
+ 72,173, //lods %ds:(%rsi),%rax
+ 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
+ 196,98,125,24,72,16, //vbroadcastss 0x10(%rax),%ymm9
+ 196,98,125,24,80,32, //vbroadcastss 0x20(%rax),%ymm10
+ 196,98,125,24,88,48, //vbroadcastss 0x30(%rax),%ymm11
+ 196,98,125,24,96,64, //vbroadcastss 0x40(%rax),%ymm12
+ 197,36,89,219, //vmulps %ymm3,%ymm11,%ymm11
+ 196,65,36,88,220, //vaddps %ymm12,%ymm11,%ymm11
+ 197,44,89,210, //vmulps %ymm2,%ymm10,%ymm10
+ 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10
+ 197,52,89,201, //vmulps %ymm1,%ymm9,%ymm9
+ 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9
+ 197,60,89,192, //vmulps %ymm0,%ymm8,%ymm8
+ 196,65,60,88,193, //vaddps %ymm9,%ymm8,%ymm8
+ 196,98,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm9
+ 196,98,125,24,80,20, //vbroadcastss 0x14(%rax),%ymm10
+ 196,98,125,24,88,36, //vbroadcastss 0x24(%rax),%ymm11
+ 196,98,125,24,96,52, //vbroadcastss 0x34(%rax),%ymm12
+ 196,98,125,24,104,68, //vbroadcastss 0x44(%rax),%ymm13
+ 197,28,89,227, //vmulps %ymm3,%ymm12,%ymm12
+ 196,65,28,88,229, //vaddps %ymm13,%ymm12,%ymm12
+ 197,36,89,218, //vmulps %ymm2,%ymm11,%ymm11
+ 196,65,36,88,220, //vaddps %ymm12,%ymm11,%ymm11
+ 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10
+ 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10
+ 197,52,89,200, //vmulps %ymm0,%ymm9,%ymm9
+ 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9
+ 196,98,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm10
+ 196,98,125,24,88,24, //vbroadcastss 0x18(%rax),%ymm11
+ 196,98,125,24,96,40, //vbroadcastss 0x28(%rax),%ymm12
+ 196,98,125,24,104,56, //vbroadcastss 0x38(%rax),%ymm13
+ 196,98,125,24,112,72, //vbroadcastss 0x48(%rax),%ymm14
+ 197,20,89,235, //vmulps %ymm3,%ymm13,%ymm13
+ 196,65,20,88,238, //vaddps %ymm14,%ymm13,%ymm13
+ 197,28,89,226, //vmulps %ymm2,%ymm12,%ymm12
+ 196,65,28,88,229, //vaddps %ymm13,%ymm12,%ymm12
+ 197,36,89,217, //vmulps %ymm1,%ymm11,%ymm11
+ 196,65,36,88,220, //vaddps %ymm12,%ymm11,%ymm11
+ 197,44,89,208, //vmulps %ymm0,%ymm10,%ymm10
+ 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10
+ 196,98,125,24,88,12, //vbroadcastss 0xc(%rax),%ymm11
+ 196,98,125,24,96,28, //vbroadcastss 0x1c(%rax),%ymm12
+ 196,98,125,24,104,44, //vbroadcastss 0x2c(%rax),%ymm13
+ 196,98,125,24,112,60, //vbroadcastss 0x3c(%rax),%ymm14
+ 196,98,125,24,120,76, //vbroadcastss 0x4c(%rax),%ymm15
+ 197,140,89,219, //vmulps %ymm3,%ymm14,%ymm3
+ 196,193,100,88,223, //vaddps %ymm15,%ymm3,%ymm3
+ 197,148,89,210, //vmulps %ymm2,%ymm13,%ymm2
+ 197,236,88,211, //vaddps %ymm3,%ymm2,%ymm2
+ 197,156,89,201, //vmulps %ymm1,%ymm12,%ymm1
+ 197,244,88,202, //vaddps %ymm2,%ymm1,%ymm1
+ 197,164,89,192, //vmulps %ymm0,%ymm11,%ymm0
+ 197,252,88,217, //vaddps %ymm1,%ymm0,%ymm3
+ 72,173, //lods %ds:(%rsi),%rax
+ 197,124,41,192, //vmovaps %ymm8,%ymm0
+ 197,124,41,201, //vmovaps %ymm9,%ymm1
+ 197,124,41,210, //vmovaps %ymm10,%ymm2
+ 255,224, //jmpq *%rax
+};
+
CODE const uint8_t sk_matrix_perspective_avx[] = {
72,173, //lods %ds:(%rsi),%rax
196,98,125,24,0, //vbroadcastss (%rax),%ymm8
@@ -5391,6 +5672,25 @@ CODE const uint8_t sk_mirror_y_sse41[] = {
255,224, //jmpq *%rax
};
+CODE const uint8_t sk_luminance_to_alpha_sse41[] = {
+ 243,15,16,154,136,0,0,0, //movss 0x88(%rdx),%xmm3
+ 243,68,15,16,130,140,0,0,0, //movss 0x8c(%rdx),%xmm8
+ 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
+ 15,89,216, //mulps %xmm0,%xmm3
+ 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
+ 68,15,89,193, //mulps %xmm1,%xmm8
+ 68,15,88,195, //addps %xmm3,%xmm8
+ 243,15,16,154,144,0,0,0, //movss 0x90(%rdx),%xmm3
+ 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
+ 15,89,218, //mulps %xmm2,%xmm3
+ 65,15,88,216, //addps %xmm8,%xmm3
+ 72,173, //lods %ds:(%rsi),%rax
+ 15,87,192, //xorps %xmm0,%xmm0
+ 15,87,201, //xorps %xmm1,%xmm1
+ 15,87,210, //xorps %xmm2,%xmm2
+ 255,224, //jmpq *%rax
+};
+
CODE const uint8_t sk_matrix_2x3_sse41[] = {
68,15,40,201, //movaps %xmm1,%xmm9
68,15,40,192, //movaps %xmm0,%xmm8
@@ -5470,6 +5770,88 @@ CODE const uint8_t sk_matrix_3x4_sse41[] = {
255,224, //jmpq *%rax
};
+CODE const uint8_t sk_matrix_4x5_sse41[] = {
+ 68,15,40,201, //movaps %xmm1,%xmm9
+ 68,15,40,192, //movaps %xmm0,%xmm8
+ 72,173, //lods %ds:(%rsi),%rax
+ 243,15,16,0, //movss (%rax),%xmm0
+ 243,15,16,72,4, //movss 0x4(%rax),%xmm1
+ 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
+ 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10
+ 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
+ 243,68,15,16,88,32, //movss 0x20(%rax),%xmm11
+ 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
+ 243,68,15,16,96,48, //movss 0x30(%rax),%xmm12
+ 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
+ 243,68,15,16,104,64, //movss 0x40(%rax),%xmm13
+ 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
+ 68,15,89,227, //mulps %xmm3,%xmm12
+ 69,15,88,229, //addps %xmm13,%xmm12
+ 68,15,89,218, //mulps %xmm2,%xmm11
+ 69,15,88,220, //addps %xmm12,%xmm11
+ 69,15,89,209, //mulps %xmm9,%xmm10
+ 69,15,88,211, //addps %xmm11,%xmm10
+ 65,15,89,192, //mulps %xmm8,%xmm0
+ 65,15,88,194, //addps %xmm10,%xmm0
+ 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
+ 243,68,15,16,80,20, //movss 0x14(%rax),%xmm10
+ 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
+ 243,68,15,16,88,36, //movss 0x24(%rax),%xmm11
+ 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
+ 243,68,15,16,96,52, //movss 0x34(%rax),%xmm12
+ 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
+ 243,68,15,16,104,68, //movss 0x44(%rax),%xmm13
+ 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
+ 68,15,89,227, //mulps %xmm3,%xmm12
+ 69,15,88,229, //addps %xmm13,%xmm12
+ 68,15,89,218, //mulps %xmm2,%xmm11
+ 69,15,88,220, //addps %xmm12,%xmm11
+ 69,15,89,209, //mulps %xmm9,%xmm10
+ 69,15,88,211, //addps %xmm11,%xmm10
+ 65,15,89,200, //mulps %xmm8,%xmm1
+ 65,15,88,202, //addps %xmm10,%xmm1
+ 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10
+ 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
+ 243,68,15,16,88,24, //movss 0x18(%rax),%xmm11
+ 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
+ 243,68,15,16,96,40, //movss 0x28(%rax),%xmm12
+ 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
+ 243,68,15,16,104,56, //movss 0x38(%rax),%xmm13
+ 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
+ 243,68,15,16,112,72, //movss 0x48(%rax),%xmm14
+ 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14
+ 68,15,89,235, //mulps %xmm3,%xmm13
+ 69,15,88,238, //addps %xmm14,%xmm13
+ 68,15,89,226, //mulps %xmm2,%xmm12
+ 69,15,88,229, //addps %xmm13,%xmm12
+ 69,15,89,217, //mulps %xmm9,%xmm11
+ 69,15,88,220, //addps %xmm12,%xmm11
+ 69,15,89,208, //mulps %xmm8,%xmm10
+ 69,15,88,211, //addps %xmm11,%xmm10
+ 243,68,15,16,88,12, //movss 0xc(%rax),%xmm11
+ 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
+ 243,68,15,16,96,28, //movss 0x1c(%rax),%xmm12
+ 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
+ 243,68,15,16,104,44, //movss 0x2c(%rax),%xmm13
+ 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
+ 243,68,15,16,112,60, //movss 0x3c(%rax),%xmm14
+ 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14
+ 243,68,15,16,120,76, //movss 0x4c(%rax),%xmm15
+ 69,15,198,255,0, //shufps $0x0,%xmm15,%xmm15
+ 68,15,89,243, //mulps %xmm3,%xmm14
+ 69,15,88,247, //addps %xmm15,%xmm14
+ 68,15,89,234, //mulps %xmm2,%xmm13
+ 69,15,88,238, //addps %xmm14,%xmm13
+ 69,15,89,225, //mulps %xmm9,%xmm12
+ 69,15,88,229, //addps %xmm13,%xmm12
+ 69,15,89,216, //mulps %xmm8,%xmm11
+ 69,15,88,220, //addps %xmm12,%xmm11
+ 72,173, //lods %ds:(%rsi),%rax
+ 65,15,40,210, //movaps %xmm10,%xmm2
+ 65,15,40,219, //movaps %xmm11,%xmm3
+ 255,224, //jmpq *%rax
+};
+
CODE const uint8_t sk_matrix_perspective_sse41[] = {
68,15,40,192, //movaps %xmm0,%xmm8
72,173, //lods %ds:(%rsi),%rax
@@ -6475,6 +6857,25 @@ CODE const uint8_t sk_mirror_y_sse2[] = {
255,224, //jmpq *%rax
};
+CODE const uint8_t sk_luminance_to_alpha_sse2[] = {
+ 243,15,16,154,136,0,0,0, //movss 0x88(%rdx),%xmm3
+ 243,68,15,16,130,140,0,0,0, //movss 0x8c(%rdx),%xmm8
+ 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
+ 15,89,216, //mulps %xmm0,%xmm3
+ 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8
+ 68,15,89,193, //mulps %xmm1,%xmm8
+ 68,15,88,195, //addps %xmm3,%xmm8
+ 243,15,16,154,144,0,0,0, //movss 0x90(%rdx),%xmm3
+ 15,198,219,0, //shufps $0x0,%xmm3,%xmm3
+ 15,89,218, //mulps %xmm2,%xmm3
+ 65,15,88,216, //addps %xmm8,%xmm3
+ 72,173, //lods %ds:(%rsi),%rax
+ 15,87,192, //xorps %xmm0,%xmm0
+ 15,87,201, //xorps %xmm1,%xmm1
+ 15,87,210, //xorps %xmm2,%xmm2
+ 255,224, //jmpq *%rax
+};
+
CODE const uint8_t sk_matrix_2x3_sse2[] = {
68,15,40,201, //movaps %xmm1,%xmm9
68,15,40,192, //movaps %xmm0,%xmm8
@@ -6554,6 +6955,88 @@ CODE const uint8_t sk_matrix_3x4_sse2[] = {
255,224, //jmpq *%rax
};
+CODE const uint8_t sk_matrix_4x5_sse2[] = {
+ 68,15,40,201, //movaps %xmm1,%xmm9
+ 68,15,40,192, //movaps %xmm0,%xmm8
+ 72,173, //lods %ds:(%rsi),%rax
+ 243,15,16,0, //movss (%rax),%xmm0
+ 243,15,16,72,4, //movss 0x4(%rax),%xmm1
+ 15,198,192,0, //shufps $0x0,%xmm0,%xmm0
+ 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10
+ 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
+ 243,68,15,16,88,32, //movss 0x20(%rax),%xmm11
+ 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
+ 243,68,15,16,96,48, //movss 0x30(%rax),%xmm12
+ 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
+ 243,68,15,16,104,64, //movss 0x40(%rax),%xmm13
+ 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
+ 68,15,89,227, //mulps %xmm3,%xmm12
+ 69,15,88,229, //addps %xmm13,%xmm12
+ 68,15,89,218, //mulps %xmm2,%xmm11
+ 69,15,88,220, //addps %xmm12,%xmm11
+ 69,15,89,209, //mulps %xmm9,%xmm10
+ 69,15,88,211, //addps %xmm11,%xmm10
+ 65,15,89,192, //mulps %xmm8,%xmm0
+ 65,15,88,194, //addps %xmm10,%xmm0
+ 15,198,201,0, //shufps $0x0,%xmm1,%xmm1
+ 243,68,15,16,80,20, //movss 0x14(%rax),%xmm10
+ 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
+ 243,68,15,16,88,36, //movss 0x24(%rax),%xmm11
+ 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
+ 243,68,15,16,96,52, //movss 0x34(%rax),%xmm12
+ 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
+ 243,68,15,16,104,68, //movss 0x44(%rax),%xmm13
+ 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
+ 68,15,89,227, //mulps %xmm3,%xmm12
+ 69,15,88,229, //addps %xmm13,%xmm12
+ 68,15,89,218, //mulps %xmm2,%xmm11
+ 69,15,88,220, //addps %xmm12,%xmm11
+ 69,15,89,209, //mulps %xmm9,%xmm10
+ 69,15,88,211, //addps %xmm11,%xmm10
+ 65,15,89,200, //mulps %xmm8,%xmm1
+ 65,15,88,202, //addps %xmm10,%xmm1
+ 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10
+ 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
+ 243,68,15,16,88,24, //movss 0x18(%rax),%xmm11
+ 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
+ 243,68,15,16,96,40, //movss 0x28(%rax),%xmm12
+ 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
+ 243,68,15,16,104,56, //movss 0x38(%rax),%xmm13
+ 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
+ 243,68,15,16,112,72, //movss 0x48(%rax),%xmm14
+ 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14
+ 68,15,89,235, //mulps %xmm3,%xmm13
+ 69,15,88,238, //addps %xmm14,%xmm13
+ 68,15,89,226, //mulps %xmm2,%xmm12
+ 69,15,88,229, //addps %xmm13,%xmm12
+ 69,15,89,217, //mulps %xmm9,%xmm11
+ 69,15,88,220, //addps %xmm12,%xmm11
+ 69,15,89,208, //mulps %xmm8,%xmm10
+ 69,15,88,211, //addps %xmm11,%xmm10
+ 243,68,15,16,88,12, //movss 0xc(%rax),%xmm11
+ 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
+ 243,68,15,16,96,28, //movss 0x1c(%rax),%xmm12
+ 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
+ 243,68,15,16,104,44, //movss 0x2c(%rax),%xmm13
+ 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
+ 243,68,15,16,112,60, //movss 0x3c(%rax),%xmm14
+ 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14
+ 243,68,15,16,120,76, //movss 0x4c(%rax),%xmm15
+ 69,15,198,255,0, //shufps $0x0,%xmm15,%xmm15
+ 68,15,89,243, //mulps %xmm3,%xmm14
+ 69,15,88,247, //addps %xmm15,%xmm14
+ 68,15,89,234, //mulps %xmm2,%xmm13
+ 69,15,88,238, //addps %xmm14,%xmm13
+ 69,15,89,225, //mulps %xmm9,%xmm12
+ 69,15,88,229, //addps %xmm13,%xmm12
+ 69,15,89,216, //mulps %xmm8,%xmm11
+ 69,15,88,220, //addps %xmm12,%xmm11
+ 72,173, //lods %ds:(%rsi),%rax
+ 65,15,40,210, //movaps %xmm10,%xmm2
+ 65,15,40,219, //movaps %xmm11,%xmm3
+ 255,224, //jmpq *%rax
+};
+
CODE const uint8_t sk_matrix_perspective_sse2[] = {
68,15,40,192, //movaps %xmm0,%xmm8
72,173, //lods %ds:(%rsi),%rax
@@ -7713,6 +8196,20 @@ CODE const uint8_t sk_mirror_y_hsw[] = {
255,224, //jmpq *%rax
};
+CODE const uint8_t sk_luminance_to_alpha_hsw[] = {
+ 196,98,125,24,130,136,0,0,0, //vbroadcastss 0x88(%rdx),%ymm8
+ 196,226,125,24,154,140,0,0,0, //vbroadcastss 0x8c(%rdx),%ymm3
+ 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1
+ 196,98,125,168,193, //vfmadd213ps %ymm1,%ymm0,%ymm8
+ 196,226,125,24,154,144,0,0,0, //vbroadcastss 0x90(%rdx),%ymm3
+ 196,194,109,168,216, //vfmadd213ps %ymm8,%ymm2,%ymm3
+ 72,173, //lods %ds:(%rsi),%rax
+ 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0
+ 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
+ 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2
+ 255,224, //jmpq *%rax
+};
+
CODE const uint8_t sk_matrix_2x3_hsw[] = {
72,173, //lods %ds:(%rsi),%rax
196,98,125,24,8, //vbroadcastss (%rax),%ymm9
@@ -7761,6 +8258,52 @@ CODE const uint8_t sk_matrix_3x4_hsw[] = {
255,224, //jmpq *%rax
};
+CODE const uint8_t sk_matrix_4x5_hsw[] = {
+ 72,173, //lods %ds:(%rsi),%rax
+ 196,98,125,24,8, //vbroadcastss (%rax),%ymm9
+ 196,98,125,24,80,16, //vbroadcastss 0x10(%rax),%ymm10
+ 196,98,125,24,88,32, //vbroadcastss 0x20(%rax),%ymm11
+ 196,98,125,24,96,48, //vbroadcastss 0x30(%rax),%ymm12
+ 196,98,125,24,64,64, //vbroadcastss 0x40(%rax),%ymm8
+ 196,66,101,184,196, //vfmadd231ps %ymm12,%ymm3,%ymm8
+ 196,66,109,184,195, //vfmadd231ps %ymm11,%ymm2,%ymm8
+ 196,66,117,184,194, //vfmadd231ps %ymm10,%ymm1,%ymm8
+ 196,66,125,184,193, //vfmadd231ps %ymm9,%ymm0,%ymm8
+ 196,98,125,24,80,4, //vbroadcastss 0x4(%rax),%ymm10
+ 196,98,125,24,88,20, //vbroadcastss 0x14(%rax),%ymm11
+ 196,98,125,24,96,36, //vbroadcastss 0x24(%rax),%ymm12
+ 196,98,125,24,104,52, //vbroadcastss 0x34(%rax),%ymm13
+ 196,98,125,24,72,68, //vbroadcastss 0x44(%rax),%ymm9
+ 196,66,101,184,205, //vfmadd231ps %ymm13,%ymm3,%ymm9
+ 196,66,109,184,204, //vfmadd231ps %ymm12,%ymm2,%ymm9
+ 196,66,117,184,203, //vfmadd231ps %ymm11,%ymm1,%ymm9
+ 196,66,125,184,202, //vfmadd231ps %ymm10,%ymm0,%ymm9
+ 196,98,125,24,88,8, //vbroadcastss 0x8(%rax),%ymm11
+ 196,98,125,24,96,24, //vbroadcastss 0x18(%rax),%ymm12
+ 196,98,125,24,104,40, //vbroadcastss 0x28(%rax),%ymm13
+ 196,98,125,24,112,56, //vbroadcastss 0x38(%rax),%ymm14
+ 196,98,125,24,80,72, //vbroadcastss 0x48(%rax),%ymm10
+ 196,66,101,184,214, //vfmadd231ps %ymm14,%ymm3,%ymm10
+ 196,66,109,184,213, //vfmadd231ps %ymm13,%ymm2,%ymm10
+ 196,66,117,184,212, //vfmadd231ps %ymm12,%ymm1,%ymm10
+ 196,66,125,184,211, //vfmadd231ps %ymm11,%ymm0,%ymm10
+ 196,98,125,24,96,12, //vbroadcastss 0xc(%rax),%ymm12
+ 196,98,125,24,104,28, //vbroadcastss 0x1c(%rax),%ymm13
+ 196,98,125,24,112,44, //vbroadcastss 0x2c(%rax),%ymm14
+ 196,98,125,24,120,60, //vbroadcastss 0x3c(%rax),%ymm15
+ 196,98,125,24,88,76, //vbroadcastss 0x4c(%rax),%ymm11
+ 196,66,101,184,223, //vfmadd231ps %ymm15,%ymm3,%ymm11
+ 196,66,109,184,222, //vfmadd231ps %ymm14,%ymm2,%ymm11
+ 196,66,117,184,221, //vfmadd231ps %ymm13,%ymm1,%ymm11
+ 196,66,125,184,220, //vfmadd231ps %ymm12,%ymm0,%ymm11
+ 72,173, //lods %ds:(%rsi),%rax
+ 197,124,41,192, //vmovaps %ymm8,%ymm0
+ 197,124,41,201, //vmovaps %ymm9,%ymm1
+ 197,124,41,210, //vmovaps %ymm10,%ymm2
+ 197,124,41,219, //vmovaps %ymm11,%ymm3
+ 255,224, //jmpq *%rax
+};
+
CODE const uint8_t sk_matrix_perspective_hsw[] = {
72,173, //lods %ds:(%rsi),%rax
196,98,125,24,0, //vbroadcastss (%rax),%ymm8
@@ -9224,6 +9767,22 @@ CODE const uint8_t sk_mirror_y_avx[] = {
255,224, //jmpq *%rax
};
+CODE const uint8_t sk_luminance_to_alpha_avx[] = {
+ 196,226,125,24,154,136,0,0,0, //vbroadcastss 0x88(%rdx),%ymm3
+ 197,228,89,192, //vmulps %ymm0,%ymm3,%ymm0
+ 196,226,125,24,154,140,0,0,0, //vbroadcastss 0x8c(%rdx),%ymm3
+ 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1
+ 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0
+ 196,226,125,24,138,144,0,0,0, //vbroadcastss 0x90(%rdx),%ymm1
+ 197,244,89,202, //vmulps %ymm2,%ymm1,%ymm1
+ 197,252,88,217, //vaddps %ymm1,%ymm0,%ymm3
+ 72,173, //lods %ds:(%rsi),%rax
+ 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0
+ 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1
+ 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2
+ 255,224, //jmpq *%rax
+};
+
CODE const uint8_t sk_matrix_2x3_avx[] = {
72,173, //lods %ds:(%rsi),%rax
196,98,125,24,0, //vbroadcastss (%rax),%ymm8
@@ -9283,6 +9842,67 @@ CODE const uint8_t sk_matrix_3x4_avx[] = {
255,224, //jmpq *%rax
};
+CODE const uint8_t sk_matrix_4x5_avx[] = {
+ 72,173, //lods %ds:(%rsi),%rax
+ 196,98,125,24,0, //vbroadcastss (%rax),%ymm8
+ 196,98,125,24,72,16, //vbroadcastss 0x10(%rax),%ymm9
+ 196,98,125,24,80,32, //vbroadcastss 0x20(%rax),%ymm10
+ 196,98,125,24,88,48, //vbroadcastss 0x30(%rax),%ymm11
+ 196,98,125,24,96,64, //vbroadcastss 0x40(%rax),%ymm12
+ 197,36,89,219, //vmulps %ymm3,%ymm11,%ymm11
+ 196,65,36,88,220, //vaddps %ymm12,%ymm11,%ymm11
+ 197,44,89,210, //vmulps %ymm2,%ymm10,%ymm10
+ 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10
+ 197,52,89,201, //vmulps %ymm1,%ymm9,%ymm9
+ 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9
+ 197,60,89,192, //vmulps %ymm0,%ymm8,%ymm8
+ 196,65,60,88,193, //vaddps %ymm9,%ymm8,%ymm8
+ 196,98,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm9
+ 196,98,125,24,80,20, //vbroadcastss 0x14(%rax),%ymm10
+ 196,98,125,24,88,36, //vbroadcastss 0x24(%rax),%ymm11
+ 196,98,125,24,96,52, //vbroadcastss 0x34(%rax),%ymm12
+ 196,98,125,24,104,68, //vbroadcastss 0x44(%rax),%ymm13
+ 197,28,89,227, //vmulps %ymm3,%ymm12,%ymm12
+ 196,65,28,88,229, //vaddps %ymm13,%ymm12,%ymm12
+ 197,36,89,218, //vmulps %ymm2,%ymm11,%ymm11
+ 196,65,36,88,220, //vaddps %ymm12,%ymm11,%ymm11
+ 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10
+ 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10
+ 197,52,89,200, //vmulps %ymm0,%ymm9,%ymm9
+ 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9
+ 196,98,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm10
+ 196,98,125,24,88,24, //vbroadcastss 0x18(%rax),%ymm11
+ 196,98,125,24,96,40, //vbroadcastss 0x28(%rax),%ymm12
+ 196,98,125,24,104,56, //vbroadcastss 0x38(%rax),%ymm13
+ 196,98,125,24,112,72, //vbroadcastss 0x48(%rax),%ymm14
+ 197,20,89,235, //vmulps %ymm3,%ymm13,%ymm13
+ 196,65,20,88,238, //vaddps %ymm14,%ymm13,%ymm13
+ 197,28,89,226, //vmulps %ymm2,%ymm12,%ymm12
+ 196,65,28,88,229, //vaddps %ymm13,%ymm12,%ymm12
+ 197,36,89,217, //vmulps %ymm1,%ymm11,%ymm11
+ 196,65,36,88,220, //vaddps %ymm12,%ymm11,%ymm11
+ 197,44,89,208, //vmulps %ymm0,%ymm10,%ymm10
+ 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10
+ 196,98,125,24,88,12, //vbroadcastss 0xc(%rax),%ymm11
+ 196,98,125,24,96,28, //vbroadcastss 0x1c(%rax),%ymm12
+ 196,98,125,24,104,44, //vbroadcastss 0x2c(%rax),%ymm13
+ 196,98,125,24,112,60, //vbroadcastss 0x3c(%rax),%ymm14
+ 196,98,125,24,120,76, //vbroadcastss 0x4c(%rax),%ymm15
+ 197,140,89,219, //vmulps %ymm3,%ymm14,%ymm3
+ 196,193,100,88,223, //vaddps %ymm15,%ymm3,%ymm3
+ 197,148,89,210, //vmulps %ymm2,%ymm13,%ymm2
+ 197,236,88,211, //vaddps %ymm3,%ymm2,%ymm2
+ 197,156,89,201, //vmulps %ymm1,%ymm12,%ymm1
+ 197,244,88,202, //vaddps %ymm2,%ymm1,%ymm1
+ 197,164,89,192, //vmulps %ymm0,%ymm11,%ymm0
+ 197,252,88,217, //vaddps %ymm1,%ymm0,%ymm3
+ 72,173, //lods %ds:(%rsi),%rax
+ 197,124,41,192, //vmovaps %ymm8,%ymm0
+ 197,124,41,201, //vmovaps %ymm9,%ymm1
+ 197,124,41,210, //vmovaps %ymm10,%ymm2
+ 255,224, //jmpq *%rax
+};
+
CODE const uint8_t sk_matrix_perspective_avx[] = {
72,173, //lods %ds:(%rsi),%rax
196,98,125,24,0, //vbroadcastss (%rax),%ymm8
@@ -10247,6 +10867,25 @@ CODE const uint8_t sk_mirror_y_sse41[] = {
255,224, //jmpq *%rax
};
+CODE const uint8_t sk_luminance_to_alpha_sse41[] = { // x86-64 machine code bytes, one instruction per row with its disassembly; leaves a weighted sum of xmm0..xmm2 in xmm3 and zeroes xmm0..xmm2
+ 243,15,16,154,136,0,0,0, //movss 0x88(%rdx),%xmm3 ; scalar weight for xmm0 (presumably k->lum_r -- confirm struct offset)
+ 243,68,15,16,130,140,0,0,0, //movss 0x8c(%rdx),%xmm8 ; scalar weight for xmm1 (presumably k->lum_g)
+ 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 ; splat lane 0 across all four lanes
+ 15,89,216, //mulps %xmm0,%xmm3 ; xmm3 = weight0 * xmm0
+ 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 ; splat lane 0 across all four lanes
+ 68,15,89,193, //mulps %xmm1,%xmm8 ; xmm8 = weight1 * xmm1
+ 68,15,88,195, //addps %xmm3,%xmm8 ; xmm8 += xmm3
+ 243,15,16,154,144,0,0,0, //movss 0x90(%rdx),%xmm3 ; scalar weight for xmm2 (presumably k->lum_b)
+ 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 ; splat lane 0 across all four lanes
+ 15,89,218, //mulps %xmm2,%xmm3 ; xmm3 = weight2 * xmm2
+ 65,15,88,216, //addps %xmm8,%xmm3 ; xmm3 = full weighted sum
+ 72,173, //lods %ds:(%rsi),%rax ; rax = next 8-byte entry read from the stream at rsi
+ 15,87,192, //xorps %xmm0,%xmm0 ; xmm0 = 0
+ 15,87,201, //xorps %xmm1,%xmm1 ; xmm1 = 0
+ 15,87,210, //xorps %xmm2,%xmm2 ; xmm2 = 0
+ 255,224, //jmpq *%rax ; jump to the address just loaded (presumably the next stage)
+};
+
CODE const uint8_t sk_matrix_2x3_sse41[] = {
68,15,40,201, //movaps %xmm1,%xmm9
68,15,40,192, //movaps %xmm0,%xmm8
@@ -10326,6 +10965,88 @@ CODE const uint8_t sk_matrix_3x4_sse41[] = {
255,224, //jmpq *%rax
};
+CODE const uint8_t sk_matrix_4x5_sse41[] = { // x86-64 machine code bytes, one instruction per row with its disassembly; four mul/add chains over xmm0..xmm3 using 20 floats at (%rax)
+ 68,15,40,201, //movaps %xmm1,%xmm9 ; keep incoming xmm1 in xmm9 (xmm1 is overwritten below)
+ 68,15,40,192, //movaps %xmm0,%xmm8 ; keep incoming xmm0 in xmm8 (xmm0 is overwritten below)
+ 72,173, //lods %ds:(%rsi),%rax ; rax = next 8-byte entry from the rsi stream, used below as the float-table base
+ 243,15,16,0, //movss (%rax),%xmm0 ; output 0 chain: floats at byte offsets 0x00,0x10,0x20,0x30 and constant 0x40
+ 243,15,16,72,4, //movss 0x4(%rax),%xmm1 ; first coefficient of output 1, splatted later
+ 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 ; splat lane 0
+ 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10
+ 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
+ 243,68,15,16,88,32, //movss 0x20(%rax),%xmm11
+ 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
+ 243,68,15,16,96,48, //movss 0x30(%rax),%xmm12
+ 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
+ 243,68,15,16,104,64, //movss 0x40(%rax),%xmm13
+ 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
+ 68,15,89,227, //mulps %xmm3,%xmm12 ; innermost term: coefficient * xmm3
+ 69,15,88,229, //addps %xmm13,%xmm12 ; ... + constant
+ 68,15,89,218, //mulps %xmm2,%xmm11
+ 69,15,88,220, //addps %xmm12,%xmm11
+ 69,15,89,209, //mulps %xmm9,%xmm10 ; uses the saved xmm1
+ 69,15,88,211, //addps %xmm11,%xmm10
+ 65,15,89,192, //mulps %xmm8,%xmm0 ; uses the saved xmm0
+ 65,15,88,194, //addps %xmm10,%xmm0 ; xmm0 = output 0
+ 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 ; output 1 chain: offsets 0x04,0x14,0x24,0x34 and constant 0x44
+ 243,68,15,16,80,20, //movss 0x14(%rax),%xmm10
+ 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
+ 243,68,15,16,88,36, //movss 0x24(%rax),%xmm11
+ 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
+ 243,68,15,16,96,52, //movss 0x34(%rax),%xmm12
+ 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
+ 243,68,15,16,104,68, //movss 0x44(%rax),%xmm13
+ 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
+ 68,15,89,227, //mulps %xmm3,%xmm12
+ 69,15,88,229, //addps %xmm13,%xmm12
+ 68,15,89,218, //mulps %xmm2,%xmm11
+ 69,15,88,220, //addps %xmm12,%xmm11
+ 69,15,89,209, //mulps %xmm9,%xmm10
+ 69,15,88,211, //addps %xmm11,%xmm10
+ 65,15,89,200, //mulps %xmm8,%xmm1
+ 65,15,88,202, //addps %xmm10,%xmm1 ; xmm1 = output 1
+ 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10 ; output 2 chain: offsets 0x08,0x18,0x28,0x38 and constant 0x48
+ 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
+ 243,68,15,16,88,24, //movss 0x18(%rax),%xmm11
+ 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
+ 243,68,15,16,96,40, //movss 0x28(%rax),%xmm12
+ 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
+ 243,68,15,16,104,56, //movss 0x38(%rax),%xmm13
+ 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
+ 243,68,15,16,112,72, //movss 0x48(%rax),%xmm14
+ 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14
+ 68,15,89,235, //mulps %xmm3,%xmm13
+ 69,15,88,238, //addps %xmm14,%xmm13
+ 68,15,89,226, //mulps %xmm2,%xmm12
+ 69,15,88,229, //addps %xmm13,%xmm12
+ 69,15,89,217, //mulps %xmm9,%xmm11
+ 69,15,88,220, //addps %xmm12,%xmm11
+ 69,15,89,208, //mulps %xmm8,%xmm10
+ 69,15,88,211, //addps %xmm11,%xmm10 ; xmm10 = output 2 (xmm2 is still needed as an input)
+ 243,68,15,16,88,12, //movss 0xc(%rax),%xmm11 ; output 3 chain: offsets 0x0c,0x1c,0x2c,0x3c and constant 0x4c
+ 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
+ 243,68,15,16,96,28, //movss 0x1c(%rax),%xmm12
+ 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
+ 243,68,15,16,104,44, //movss 0x2c(%rax),%xmm13
+ 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
+ 243,68,15,16,112,60, //movss 0x3c(%rax),%xmm14
+ 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14
+ 243,68,15,16,120,76, //movss 0x4c(%rax),%xmm15
+ 69,15,198,255,0, //shufps $0x0,%xmm15,%xmm15
+ 68,15,89,243, //mulps %xmm3,%xmm14
+ 69,15,88,247, //addps %xmm15,%xmm14
+ 68,15,89,234, //mulps %xmm2,%xmm13
+ 69,15,88,238, //addps %xmm14,%xmm13
+ 69,15,89,225, //mulps %xmm9,%xmm12
+ 69,15,88,229, //addps %xmm13,%xmm12
+ 69,15,89,216, //mulps %xmm8,%xmm11
+ 69,15,88,220, //addps %xmm12,%xmm11 ; xmm11 = output 3
+ 72,173, //lods %ds:(%rsi),%rax ; rax = next 8-byte entry from the rsi stream
+ 65,15,40,210, //movaps %xmm10,%xmm2 ; commit output 2 now that all inputs have been consumed
+ 65,15,40,219, //movaps %xmm11,%xmm3 ; commit output 3
+ 255,224, //jmpq *%rax ; jump to the address just loaded (presumably the next stage)
+};
+
CODE const uint8_t sk_matrix_perspective_sse41[] = {
68,15,40,192, //movaps %xmm0,%xmm8
72,173, //lods %ds:(%rsi),%rax
@@ -11358,6 +12079,25 @@ CODE const uint8_t sk_mirror_y_sse2[] = {
255,224, //jmpq *%rax
};
+CODE const uint8_t sk_luminance_to_alpha_sse2[] = { // x86-64 machine code bytes, one instruction per row with its disassembly; leaves a weighted sum of xmm0..xmm2 in xmm3 and zeroes xmm0..xmm2
+ 243,15,16,154,136,0,0,0, //movss 0x88(%rdx),%xmm3 ; scalar weight for xmm0 (presumably k->lum_r -- confirm struct offset)
+ 243,68,15,16,130,140,0,0,0, //movss 0x8c(%rdx),%xmm8 ; scalar weight for xmm1 (presumably k->lum_g)
+ 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 ; splat lane 0 across all four lanes
+ 15,89,216, //mulps %xmm0,%xmm3 ; xmm3 = weight0 * xmm0
+ 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 ; splat lane 0 across all four lanes
+ 68,15,89,193, //mulps %xmm1,%xmm8 ; xmm8 = weight1 * xmm1
+ 68,15,88,195, //addps %xmm3,%xmm8 ; xmm8 += xmm3
+ 243,15,16,154,144,0,0,0, //movss 0x90(%rdx),%xmm3 ; scalar weight for xmm2 (presumably k->lum_b)
+ 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 ; splat lane 0 across all four lanes
+ 15,89,218, //mulps %xmm2,%xmm3 ; xmm3 = weight2 * xmm2
+ 65,15,88,216, //addps %xmm8,%xmm3 ; xmm3 = full weighted sum
+ 72,173, //lods %ds:(%rsi),%rax ; rax = next 8-byte entry read from the stream at rsi
+ 15,87,192, //xorps %xmm0,%xmm0 ; xmm0 = 0
+ 15,87,201, //xorps %xmm1,%xmm1 ; xmm1 = 0
+ 15,87,210, //xorps %xmm2,%xmm2 ; xmm2 = 0
+ 255,224, //jmpq *%rax ; jump to the address just loaded (presumably the next stage)
+};
+
CODE const uint8_t sk_matrix_2x3_sse2[] = {
68,15,40,201, //movaps %xmm1,%xmm9
68,15,40,192, //movaps %xmm0,%xmm8
@@ -11437,6 +12177,88 @@ CODE const uint8_t sk_matrix_3x4_sse2[] = {
255,224, //jmpq *%rax
};
+CODE const uint8_t sk_matrix_4x5_sse2[] = { // x86-64 machine code bytes, one instruction per row with its disassembly; four mul/add chains over xmm0..xmm3 using 20 floats at (%rax)
+ 68,15,40,201, //movaps %xmm1,%xmm9 ; keep incoming xmm1 in xmm9 (xmm1 is overwritten below)
+ 68,15,40,192, //movaps %xmm0,%xmm8 ; keep incoming xmm0 in xmm8 (xmm0 is overwritten below)
+ 72,173, //lods %ds:(%rsi),%rax ; rax = next 8-byte entry from the rsi stream, used below as the float-table base
+ 243,15,16,0, //movss (%rax),%xmm0 ; output 0 chain: floats at byte offsets 0x00,0x10,0x20,0x30 and constant 0x40
+ 243,15,16,72,4, //movss 0x4(%rax),%xmm1 ; first coefficient of output 1, splatted later
+ 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 ; splat lane 0
+ 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10
+ 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
+ 243,68,15,16,88,32, //movss 0x20(%rax),%xmm11
+ 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
+ 243,68,15,16,96,48, //movss 0x30(%rax),%xmm12
+ 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
+ 243,68,15,16,104,64, //movss 0x40(%rax),%xmm13
+ 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
+ 68,15,89,227, //mulps %xmm3,%xmm12 ; innermost term: coefficient * xmm3
+ 69,15,88,229, //addps %xmm13,%xmm12 ; ... + constant
+ 68,15,89,218, //mulps %xmm2,%xmm11
+ 69,15,88,220, //addps %xmm12,%xmm11
+ 69,15,89,209, //mulps %xmm9,%xmm10 ; uses the saved xmm1
+ 69,15,88,211, //addps %xmm11,%xmm10
+ 65,15,89,192, //mulps %xmm8,%xmm0 ; uses the saved xmm0
+ 65,15,88,194, //addps %xmm10,%xmm0 ; xmm0 = output 0
+ 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 ; output 1 chain: offsets 0x04,0x14,0x24,0x34 and constant 0x44
+ 243,68,15,16,80,20, //movss 0x14(%rax),%xmm10
+ 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
+ 243,68,15,16,88,36, //movss 0x24(%rax),%xmm11
+ 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
+ 243,68,15,16,96,52, //movss 0x34(%rax),%xmm12
+ 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
+ 243,68,15,16,104,68, //movss 0x44(%rax),%xmm13
+ 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
+ 68,15,89,227, //mulps %xmm3,%xmm12
+ 69,15,88,229, //addps %xmm13,%xmm12
+ 68,15,89,218, //mulps %xmm2,%xmm11
+ 69,15,88,220, //addps %xmm12,%xmm11
+ 69,15,89,209, //mulps %xmm9,%xmm10
+ 69,15,88,211, //addps %xmm11,%xmm10
+ 65,15,89,200, //mulps %xmm8,%xmm1
+ 65,15,88,202, //addps %xmm10,%xmm1 ; xmm1 = output 1
+ 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10 ; output 2 chain: offsets 0x08,0x18,0x28,0x38 and constant 0x48
+ 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10
+ 243,68,15,16,88,24, //movss 0x18(%rax),%xmm11
+ 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
+ 243,68,15,16,96,40, //movss 0x28(%rax),%xmm12
+ 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
+ 243,68,15,16,104,56, //movss 0x38(%rax),%xmm13
+ 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
+ 243,68,15,16,112,72, //movss 0x48(%rax),%xmm14
+ 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14
+ 68,15,89,235, //mulps %xmm3,%xmm13
+ 69,15,88,238, //addps %xmm14,%xmm13
+ 68,15,89,226, //mulps %xmm2,%xmm12
+ 69,15,88,229, //addps %xmm13,%xmm12
+ 69,15,89,217, //mulps %xmm9,%xmm11
+ 69,15,88,220, //addps %xmm12,%xmm11
+ 69,15,89,208, //mulps %xmm8,%xmm10
+ 69,15,88,211, //addps %xmm11,%xmm10 ; xmm10 = output 2 (xmm2 is still needed as an input)
+ 243,68,15,16,88,12, //movss 0xc(%rax),%xmm11 ; output 3 chain: offsets 0x0c,0x1c,0x2c,0x3c and constant 0x4c
+ 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11
+ 243,68,15,16,96,28, //movss 0x1c(%rax),%xmm12
+ 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12
+ 243,68,15,16,104,44, //movss 0x2c(%rax),%xmm13
+ 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13
+ 243,68,15,16,112,60, //movss 0x3c(%rax),%xmm14
+ 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14
+ 243,68,15,16,120,76, //movss 0x4c(%rax),%xmm15
+ 69,15,198,255,0, //shufps $0x0,%xmm15,%xmm15
+ 68,15,89,243, //mulps %xmm3,%xmm14
+ 69,15,88,247, //addps %xmm15,%xmm14
+ 68,15,89,234, //mulps %xmm2,%xmm13
+ 69,15,88,238, //addps %xmm14,%xmm13
+ 69,15,89,225, //mulps %xmm9,%xmm12
+ 69,15,88,229, //addps %xmm13,%xmm12
+ 69,15,89,216, //mulps %xmm8,%xmm11
+ 69,15,88,220, //addps %xmm12,%xmm11 ; xmm11 = output 3
+ 72,173, //lods %ds:(%rsi),%rax ; rax = next 8-byte entry from the rsi stream
+ 65,15,40,210, //movaps %xmm10,%xmm2 ; commit output 2 now that all inputs have been consumed
+ 65,15,40,219, //movaps %xmm11,%xmm3 ; commit output 3
+ 255,224, //jmpq *%rax ; jump to the address just loaded (presumably the next stage)
+};
+
CODE const uint8_t sk_matrix_perspective_sse2[] = {
68,15,40,192, //movaps %xmm0,%xmm8
72,173, //lods %ds:(%rsi),%rax
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index 0147408632..c22b89eac8 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -1073,6 +1073,11 @@ STAGE(repeat_y) { g = repeat(g, *(const float*)ctx, k); }
STAGE(mirror_x) { r = mirror(r, *(const float*)ctx, k); }  // r <- mirror(r, limit, k); limit is the single float ctx points at (mirror() is defined outside this hunk)
STAGE(mirror_y) { g = mirror(g, *(const float*)ctx, k); }  // g <- mirror(g, limit, k); limit is the single float ctx points at (mirror() is defined outside this hunk)
+STAGE(luminance_to_alpha) {  // Overwrite a with a weighted sum of r, g, b, then clear the three color channels.
+ a = r*k->lum_r + g*k->lum_g + b*k->lum_b;  // weights are read through k; the incoming a is discarded
+ r = g = b = 0;  // only a carries data out of this stage
+}
+
STAGE(matrix_2x3) {
auto m = (const float*)ctx;
@@ -1091,6 +1096,18 @@ STAGE(matrix_3x4) {
g = G;
b = B;
}
+STAGE(matrix_4x5) {  // Map (r,g,b,a) through the 20 floats at ctx: each output is a weighted sum of all four inputs plus a constant.
+ auto m = (const float*)ctx;  // m[0..19]; weight of input i for output o is m[4*i + o], constants are m[16..19]
+
+ auto R = mad(r,m[0], mad(g,m[4], mad(b,m[ 8], mad(a,m[12], m[16])))),  // mad(x,y,z) is presumably x*y + z (helper defined outside this hunk)
+ G = mad(r,m[1], mad(g,m[5], mad(b,m[ 9], mad(a,m[13], m[17])))),
+ B = mad(r,m[2], mad(g,m[6], mad(b,m[10], mad(a,m[14], m[18])))),
+ A = mad(r,m[3], mad(g,m[7], mad(b,m[11], mad(a,m[15], m[19]))));
+ r = R;  // all four results are computed from the original inputs before any channel is overwritten
+ g = G;
+ b = B;
+ a = A;
+}
STAGE(matrix_perspective) {
// N.B. Unlike the other matrix_ stages, this matrix is row-major.
auto m = (const float*)ctx;