From f04ff7696d34d810a94b7fd98aa0006955f57fc0 Mon Sep 17 00:00:00 2001 From: Mike Klein Date: Fri, 20 Oct 2017 15:50:12 -0400 Subject: translate+scale -> scale+translate This is a no-op refactor. It's just always surprised me that the matrix_scale_translate stage expects [tx ty sx sy], when scales precede the translates in the names and in both normal row-major and column-major matrix layouts. This switches to [sx sy tx ty], scale then translate. Change-Id: I2d88701121ae8013facd5a28bb0ff520211db5a6 Reviewed-on: https://skia-review.googlesource.com/62541 Reviewed-by: Mike Reed Commit-Queue: Mike Klein --- src/jumper/SkJumper_generated.S | 92 +++++++++++++++++++------------------ src/jumper/SkJumper_generated_win.S | 64 +++++++++++++------------- src/jumper/SkJumper_stages.cpp | 4 +- 3 files changed, 81 insertions(+), 79 deletions(-) (limited to 'src/jumper') diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S index 42a5c717ab..c4292fbec9 100644 --- a/src/jumper/SkJumper_generated.S +++ b/src/jumper/SkJumper_generated.S @@ -6565,14 +6565,15 @@ FUNCTION(_sk_matrix_scale_translate_vfp4) _sk_matrix_scale_translate_vfp4: .long 0xe92d4800 // push {fp, lr} .long 0xe8911008 // ldm r1, {r3, ip} + .long 0xe3a0200c // mov r2, #12 .long 0xe2811008 // add r1, r1, #8 - .long 0xe2832008 // add r2, r3, #8 .long 0xe1a0e003 // mov lr, r3 - .long 0xf4e22cbf // vld1.32 {d18[]-d19[]}, [r2 :32] - .long 0xe283200c // add r2, r3, #12 - .long 0xf4ee0cbd // vld1.32 {d16[]-d17[]}, [lr :32]! - .long 0xf2400c72 // vfma.f32 q8, q0, q9 + .long 0xf4ee2cb2 // vld1.32 {d18[]-d19[]}, [lr :32], r2 + .long 0xe2832004 // add r2, r3, #4 .long 0xf4e24cbf // vld1.32 {d20[]-d21[]}, [r2 :32] + .long 0xe2832008 // add r2, r3, #8 + .long 0xf4e20cbf // vld1.32 {d16[]-d17[]}, [r2 :32] + .long 0xf2400c72 // vfma.f32 q8, q0, q9 .long 0xf4ee2cbf // vld1.32 {d18[]-d19[]}, [lr :32] .long 0xf2422c74 // vfma.f32 q9, q1, q10 .long 0xf22001f0 // vorr q0, q8, q8 @@ -6915,6 +6916,7 @@ _sk_evenly_spaced_gradient_vfp4: .long 0xe28dd004 // add sp, sp, #4 .long 0xe8bd4ff0 // pop {r4, r5, r6, r7, r8, r9, sl, fp, lr} .long 0xe12fff12 // bx r2 + .long 0xe320f000 // nop {0} HIDDEN _sk_gauss_a_to_rgba_vfp4 .globl _sk_gauss_a_to_rgba_vfp4 @@ -6974,7 +6976,7 @@ _sk_gradient_vfp4: .long 0xf2c00050 // vmov.i32 q8, #0 .long 0xe5923000 // ldr r3, [r2] .long 0xe3530002 // cmp r3, #2 - .long 0x3a00000a // bcc 62ec + .long 0x3a00000a // bcc 62f4 .long 0xe5927024 // ldr r7, [r2, #36] .long 0xf2c04051 // vmov.i32 q10, #1 .long 0xf2c00050 // vmov.i32 q8, #0 @@ -6985,7 +6987,7 @@ _sk_gradient_vfp4: .long 0xf3468ee8 // vcge.f32 q12, q11, q12 .long 0xf35481f2 // vbsl q12, q10, q9 .long 0xf26008e8 // vadd.i32 q8, q8, q12 - .long 0x1afffff9 // bne 62d4 + .long 0x1afffff9 // bne 62dc .long 0xee314b90 // vmov.32 r4, d17[1] .long 0xe5926010 // ldr r6, [r2, #16] .long 0xee11cb90 // vmov.32 ip, d17[0] @@ -8129,7 +8131,7 @@ _sk_clut_3D_vfp4: .long 0xe0835105 // add r5, r3, r5, lsl #2 .long 0xedd55a00 // vldr s11, [r5] .long 0xee325b90 // vmov.32 r5, d18[1] - .long 0xea000004 // b 7300 + .long 0xea000004 // b 7308 .long 0xe320f000 // nop {0} .long 0x3f7ff972 // .word 0x3f7ff972 .long 0x3f7ff972 // .word 0x3f7ff972 @@ -8646,7 +8648,7 @@ _sk_clut_4D_vfp4: .long 0xf2802051 // vmov.i32 q1, #1 .long 0xf22e29e0 // vmla.i32 q1, q15, q8 .long 0xedd20a00 // vldr s1, [r2] - .long 0xea000004 // b 7b00 + .long 0xea000004 // b 7b08 .long 0xe320f000 // nop {0} .long 0x3f7ff972 // .word 0x3f7ff972 .long 0x3f7ff972 // .word 0x3f7ff972 @@ -14995,10 +14997,10 @@ HIDDEN _sk_matrix_scale_translate_skx FUNCTION(_sk_matrix_scale_translate_skx) _sk_matrix_scale_translate_skx: .byte 72,173 // lods %ds:(%rsi),%rax - .byte 196,98,125,24,64,8 // vbroadcastss 0x8(%rax),%ymm8 - .byte 98,242,61,56,168,0 // vfmadd213ps (%rax){1to8},%ymm8,%ymm0 - .byte 196,98,125,24,64,12 // vbroadcastss 0xc(%rax),%ymm8 - .byte 98,242,61,56,168,72,1 // vfmadd213ps 0x4(%rax){1to8},%ymm8,%ymm1 + .byte 196,98,125,24,0 // vbroadcastss (%rax),%ymm8 + .byte 98,242,61,56,168,64,2 // vfmadd213ps 0x8(%rax){1to8},%ymm8,%ymm0 + .byte 196,98,125,24,64,4 // vbroadcastss 0x4(%rax),%ymm8 + .byte 98,242,61,56,168,72,3 // vfmadd213ps 0xc(%rax){1to8},%ymm8,%ymm1 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -22180,11 +22182,11 @@ HIDDEN _sk_matrix_scale_translate_hsw FUNCTION(_sk_matrix_scale_translate_hsw) _sk_matrix_scale_translate_hsw: .byte 72,173 // lods %ds:(%rsi),%rax - .byte 196,98,125,24,64,8 // vbroadcastss 0x8(%rax),%ymm8 - .byte 196,98,125,24,8 // vbroadcastss (%rax),%ymm9 + .byte 196,98,125,24,0 // vbroadcastss (%rax),%ymm8 + .byte 196,98,125,24,72,8 // vbroadcastss 0x8(%rax),%ymm9 .byte 196,194,61,168,193 // vfmadd213ps %ymm9,%ymm8,%ymm0 - .byte 196,98,125,24,64,12 // vbroadcastss 0xc(%rax),%ymm8 - .byte 196,98,125,24,72,4 // vbroadcastss 0x4(%rax),%ymm9 + .byte 196,98,125,24,64,4 // vbroadcastss 0x4(%rax),%ymm8 + .byte 196,98,125,24,72,12 // vbroadcastss 0xc(%rax),%ymm9 .byte 196,194,61,168,201 // vfmadd213ps %ymm9,%ymm8,%ymm1 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -30782,12 +30784,12 @@ HIDDEN _sk_matrix_scale_translate_avx FUNCTION(_sk_matrix_scale_translate_avx) _sk_matrix_scale_translate_avx: .byte 72,173 // lods %ds:(%rsi),%rax - .byte 196,98,125,24,64,8 // vbroadcastss 0x8(%rax),%ymm8 - .byte 196,98,125,24,8 // vbroadcastss (%rax),%ymm9 + .byte 196,98,125,24,0 // vbroadcastss (%rax),%ymm8 + .byte 196,98,125,24,72,8 // vbroadcastss 0x8(%rax),%ymm9 .byte 197,188,89,192 // vmulps %ymm0,%ymm8,%ymm0 .byte 196,193,124,88,193 // vaddps %ymm9,%ymm0,%ymm0 - .byte 196,98,125,24,64,12 // vbroadcastss 0xc(%rax),%ymm8 - .byte 196,98,125,24,72,4 // vbroadcastss 0x4(%rax),%ymm9 + .byte 196,98,125,24,64,4 // vbroadcastss 0x4(%rax),%ymm8 + .byte 196,98,125,24,72,12 // vbroadcastss 0xc(%rax),%ymm9 .byte 197,188,89,201 // vmulps %ymm1,%ymm8,%ymm1 .byte 196,193,116,88,201 // vaddps %ymm9,%ymm1,%ymm1 .byte 72,173 // lods %ds:(%rsi),%rax @@ -39808,18 +39810,18 @@ HIDDEN _sk_matrix_scale_translate_sse41 FUNCTION(_sk_matrix_scale_translate_sse41) _sk_matrix_scale_translate_sse41: .byte 72,173 // lods %ds:(%rsi),%rax - .byte 243,68,15,16,64,8 // movss 0x8(%rax),%xmm8 + .byte 243,68,15,16,0 // movss (%rax),%xmm8 + .byte 243,68,15,16,72,4 // movss 0x4(%rax),%xmm9 .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8 - .byte 243,68,15,16,8 // movss (%rax),%xmm9 - .byte 243,68,15,16,80,4 // movss 0x4(%rax),%xmm10 - .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9 + .byte 243,68,15,16,80,8 // movss 0x8(%rax),%xmm10 + .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10 .byte 65,15,89,192 // mulps %xmm8,%xmm0 - .byte 65,15,88,193 // addps %xmm9,%xmm0 + .byte 65,15,88,194 // addps %xmm10,%xmm0 + .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9 .byte 243,68,15,16,64,12 // movss 0xc(%rax),%xmm8 .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8 - .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10 - .byte 65,15,89,200 // mulps %xmm8,%xmm1 - .byte 65,15,88,202 // addps %xmm10,%xmm1 + .byte 65,15,89,201 // mulps %xmm9,%xmm1 + .byte 65,15,88,200 // addps %xmm8,%xmm1 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -48111,18 +48113,18 @@ HIDDEN _sk_matrix_scale_translate_sse2 FUNCTION(_sk_matrix_scale_translate_sse2) _sk_matrix_scale_translate_sse2: .byte 72,173 // lods %ds:(%rsi),%rax - .byte 243,68,15,16,64,8 // movss 0x8(%rax),%xmm8 + .byte 243,68,15,16,0 // movss (%rax),%xmm8 + .byte 243,68,15,16,72,4 // movss 0x4(%rax),%xmm9 .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8 - .byte 243,68,15,16,8 // movss (%rax),%xmm9 - .byte 243,68,15,16,80,4 // movss 0x4(%rax),%xmm10 - .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9 + .byte 243,68,15,16,80,8 // movss 0x8(%rax),%xmm10 + .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10 .byte 65,15,89,192 // mulps %xmm8,%xmm0 - .byte 65,15,88,193 // addps %xmm9,%xmm0 + .byte 65,15,88,194 // addps %xmm10,%xmm0 + .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9 .byte 243,68,15,16,64,12 // movss 0xc(%rax),%xmm8 .byte 69,15,198,192,0 // shufps $0x0,%xmm8,%xmm8 - .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10 - .byte 65,15,89,200 // mulps %xmm8,%xmm1 - .byte 65,15,88,202 // addps %xmm10,%xmm1 + .byte 65,15,89,201 // mulps %xmm9,%xmm1 + .byte 65,15,88,200 // addps %xmm8,%xmm1 .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -69804,18 +69806,18 @@ _sk_matrix_scale_translate_sse2: .byte 131,236,8 // sub $0x8,%esp .byte 139,69,12 // mov 0xc(%ebp),%eax .byte 139,8 // mov (%eax),%ecx - .byte 243,15,16,97,8 // movss 0x8(%ecx),%xmm4 + .byte 243,15,16,33 // movss (%ecx),%xmm4 + .byte 243,15,16,105,4 // movss 0x4(%ecx),%xmm5 .byte 15,198,228,0 // shufps $0x0,%xmm4,%xmm4 - .byte 243,15,16,41 // movss (%ecx),%xmm5 - .byte 243,15,16,113,4 // movss 0x4(%ecx),%xmm6 - .byte 15,198,237,0 // shufps $0x0,%xmm5,%xmm5 + .byte 243,15,16,113,8 // movss 0x8(%ecx),%xmm6 + .byte 15,198,246,0 // shufps $0x0,%xmm6,%xmm6 .byte 15,89,196 // mulps %xmm4,%xmm0 - .byte 15,88,197 // addps %xmm5,%xmm0 + .byte 15,88,198 // addps %xmm6,%xmm0 + .byte 15,198,237,0 // shufps $0x0,%xmm5,%xmm5 .byte 243,15,16,97,12 // movss 0xc(%ecx),%xmm4 .byte 15,198,228,0 // shufps $0x0,%xmm4,%xmm4 - .byte 15,198,246,0 // shufps $0x0,%xmm6,%xmm6 - .byte 15,89,204 // mulps %xmm4,%xmm1 - .byte 15,88,206 // addps %xmm6,%xmm1 + .byte 15,89,205 // mulps %xmm5,%xmm1 + .byte 15,88,204 // addps %xmm4,%xmm1 .byte 141,72,8 // lea 0x8(%eax),%ecx .byte 131,236,8 // sub $0x8,%esp .byte 81 // push %ecx diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S index 6325b10066..f9edf539d2 100644 --- a/src/jumper/SkJumper_generated_win.S +++ b/src/jumper/SkJumper_generated_win.S @@ -5480,11 +5480,11 @@ _sk_matrix_translate_hsw LABEL PROC PUBLIC _sk_matrix_scale_translate_hsw _sk_matrix_scale_translate_hsw LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax - DB 196,98,125,24,64,8 ; vbroadcastss 0x8(%rax),%ymm8 - DB 196,98,125,24,8 ; vbroadcastss (%rax),%ymm9 + DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8 + DB 196,98,125,24,72,8 ; vbroadcastss 0x8(%rax),%ymm9 DB 196,194,61,168,193 ; vfmadd213ps %ymm9,%ymm8,%ymm0 - DB 196,98,125,24,64,12 ; vbroadcastss 0xc(%rax),%ymm8 - DB 196,98,125,24,72,4 ; vbroadcastss 0x4(%rax),%ymm9 + DB 196,98,125,24,64,4 ; vbroadcastss 0x4(%rax),%ymm8 + DB 196,98,125,24,72,12 ; vbroadcastss 0xc(%rax),%ymm9 DB 196,194,61,168,201 ; vfmadd213ps %ymm9,%ymm8,%ymm1 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax @@ -13816,12 +13816,12 @@ _sk_matrix_translate_avx LABEL PROC PUBLIC _sk_matrix_scale_translate_avx _sk_matrix_scale_translate_avx LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax - DB 196,98,125,24,64,8 ; vbroadcastss 0x8(%rax),%ymm8 - DB 196,98,125,24,8 ; vbroadcastss (%rax),%ymm9 + DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8 + DB 196,98,125,24,72,8 ; vbroadcastss 0x8(%rax),%ymm9 DB 197,188,89,192 ; vmulps %ymm0,%ymm8,%ymm0 DB 196,193,124,88,193 ; vaddps %ymm9,%ymm0,%ymm0 - DB 196,98,125,24,64,12 ; vbroadcastss 0xc(%rax),%ymm8 - DB 196,98,125,24,72,4 ; vbroadcastss 0x4(%rax),%ymm9 + DB 196,98,125,24,64,4 ; vbroadcastss 0x4(%rax),%ymm8 + DB 196,98,125,24,72,12 ; vbroadcastss 0xc(%rax),%ymm9 DB 197,188,89,201 ; vmulps %ymm1,%ymm8,%ymm1 DB 196,193,116,88,201 ; vaddps %ymm9,%ymm1,%ymm1 DB 72,173 ; lods %ds:(%rsi),%rax @@ -22580,18 +22580,18 @@ _sk_matrix_translate_sse41 LABEL PROC PUBLIC _sk_matrix_scale_translate_sse41 _sk_matrix_scale_translate_sse41 LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax - DB 243,68,15,16,64,8 ; movss 0x8(%rax),%xmm8 + DB 243,68,15,16,0 ; movss (%rax),%xmm8 + DB 243,68,15,16,72,4 ; movss 0x4(%rax),%xmm9 DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8 - DB 243,68,15,16,8 ; movss (%rax),%xmm9 - DB 243,68,15,16,80,4 ; movss 0x4(%rax),%xmm10 - DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9 + DB 243,68,15,16,80,8 ; movss 0x8(%rax),%xmm10 + DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10 DB 65,15,89,192 ; mulps %xmm8,%xmm0 - DB 65,15,88,193 ; addps %xmm9,%xmm0 + DB 65,15,88,194 ; addps %xmm10,%xmm0 + DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9 DB 243,68,15,16,64,12 ; movss 0xc(%rax),%xmm8 DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8 - DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10 - DB 65,15,89,200 ; mulps %xmm8,%xmm1 - DB 65,15,88,202 ; addps %xmm10,%xmm1 + DB 65,15,89,201 ; mulps %xmm9,%xmm1 + DB 65,15,88,200 ; addps %xmm8,%xmm1 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax @@ -30613,18 +30613,18 @@ _sk_matrix_translate_sse2 LABEL PROC PUBLIC _sk_matrix_scale_translate_sse2 _sk_matrix_scale_translate_sse2 LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax - DB 243,68,15,16,64,8 ; movss 0x8(%rax),%xmm8 + DB 243,68,15,16,0 ; movss (%rax),%xmm8 + DB 243,68,15,16,72,4 ; movss 0x4(%rax),%xmm9 DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8 - DB 243,68,15,16,8 ; movss (%rax),%xmm9 - DB 243,68,15,16,80,4 ; movss 0x4(%rax),%xmm10 - DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9 + DB 243,68,15,16,80,8 ; movss 0x8(%rax),%xmm10 + DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10 DB 65,15,89,192 ; mulps %xmm8,%xmm0 - DB 65,15,88,193 ; addps %xmm9,%xmm0 + DB 65,15,88,194 ; addps %xmm10,%xmm0 + DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9 DB 243,68,15,16,64,12 ; movss 0xc(%rax),%xmm8 DB 69,15,198,192,0 ; shufps $0x0,%xmm8,%xmm8 - DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10 - DB 65,15,89,200 ; mulps %xmm8,%xmm1 - DB 65,15,88,202 ; addps %xmm10,%xmm1 + DB 65,15,89,201 ; mulps %xmm9,%xmm1 + DB 65,15,88,200 ; addps %xmm8,%xmm1 DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax @@ -51729,18 +51729,18 @@ _sk_matrix_scale_translate_sse2 LABEL PROC DB 131,236,8 ; sub $0x8,%esp DB 139,69,12 ; mov 0xc(%ebp),%eax DB 139,8 ; mov (%eax),%ecx - DB 243,15,16,97,8 ; movss 0x8(%ecx),%xmm4 + DB 243,15,16,33 ; movss (%ecx),%xmm4 + DB 243,15,16,105,4 ; movss 0x4(%ecx),%xmm5 DB 15,198,228,0 ; shufps $0x0,%xmm4,%xmm4 - DB 243,15,16,41 ; movss (%ecx),%xmm5 - DB 243,15,16,113,4 ; movss 0x4(%ecx),%xmm6 - DB 15,198,237,0 ; shufps $0x0,%xmm5,%xmm5 + DB 243,15,16,113,8 ; movss 0x8(%ecx),%xmm6 + DB 15,198,246,0 ; shufps $0x0,%xmm6,%xmm6 DB 15,89,196 ; mulps %xmm4,%xmm0 - DB 15,88,197 ; addps %xmm5,%xmm0 + DB 15,88,198 ; addps %xmm6,%xmm0 + DB 15,198,237,0 ; shufps $0x0,%xmm5,%xmm5 DB 243,15,16,97,12 ; movss 0xc(%ecx),%xmm4 DB 15,198,228,0 ; shufps $0x0,%xmm4,%xmm4 - DB 15,198,246,0 ; shufps $0x0,%xmm6,%xmm6 - DB 15,89,204 ; mulps %xmm4,%xmm1 - DB 15,88,206 ; addps %xmm6,%xmm1 + DB 15,89,205 ; mulps %xmm5,%xmm1 + DB 15,88,204 ; addps %xmm4,%xmm1 DB 141,72,8 ; lea 0x8(%eax),%ecx DB 131,236,8 ; sub $0x8,%esp DB 81 ; push %ecx diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp index 690ba574ef..f2c701020f 100644 --- a/src/jumper/SkJumper_stages.cpp +++ b/src/jumper/SkJumper_stages.cpp @@ -1091,8 +1091,8 @@ STAGE(matrix_translate, const float* m) { g += m[1]; } STAGE(matrix_scale_translate, const float* m) { - r = mad(r,m[2], m[0]); - g = mad(g,m[3], m[1]); + r = mad(r,m[0], m[2]); + g = mad(g,m[1], m[3]); } STAGE(matrix_2x3, const float* m) { auto R = mad(r,m[0], mad(g,m[2], m[4])), -- cgit v1.2.3