diff options
author | Mike Klein <mtklein@chromium.org> | 2017-06-04 18:57:48 -0400 |
---|---|---|
committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2017-06-05 16:01:16 +0000 |
commit | 1f29bf093f01e9c9bf79cbd2ac27da62efc8e3a4 (patch) | |
tree | a823f2f44117844723463d47e6be325a7abfd026 /src | |
parent | a86385d2ff1b7db2abc19316d3e30aba54fa20fa (diff) |
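Slight streamlining for lowp load_8888 with pshufb.
We can use 2 pshufb to replace 4 unpacks when deinterlacing the colors: each pshufb makes the 4 bytes of each color channel contiguous within its register, so a single pair of 32-bit unpacks (instead of three pairs of byte unpacks) then gathers all 8 bytes of each channel into one register.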
Change-Id: I713fbbc94f5cb9eaf14f85323b0ec76dc2246e98
Reviewed-on: https://skia-review.googlesource.com/18531
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/jumper/SkJumper_generated.S | 104 | ||||
-rw-r--r-- | src/jumper/SkJumper_generated_win.S | 104 | ||||
-rw-r--r-- | src/jumper/SkJumper_stages_lowp.cpp | 13 |
3 files changed, 112 insertions, 109 deletions
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S index 7565730544..52bf029be2 100644 --- a/src/jumper/SkJumper_generated.S +++ b/src/jumper/SkJumper_generated.S @@ -36950,27 +36950,24 @@ _sk_load_8888_ssse3_lowp: .byte 72,173 // lods %ds:(%rsi),%rax .byte 76,139,24 // mov (%rax),%r11 .byte 77,133,192 // test %r8,%r8 - .byte 117,120 // jne 12e <_sk_load_8888_ssse3_lowp+0x82> - .byte 65,15,16,76,147,16 // movups 0x10(%r11,%rdx,4),%xmm1 - .byte 65,15,16,4,147 // movups (%r11,%rdx,4),%xmm0 - .byte 15,40,208 // movaps %xmm0,%xmm2 - .byte 102,15,96,209 // punpcklbw %xmm1,%xmm2 - .byte 102,15,104,193 // punpckhbw %xmm1,%xmm0 - .byte 102,68,15,111,194 // movdqa %xmm2,%xmm8 - .byte 102,68,15,96,192 // punpcklbw %xmm0,%xmm8 - .byte 102,15,104,208 // punpckhbw %xmm0,%xmm2 - .byte 102,69,15,111,200 // movdqa %xmm8,%xmm9 - .byte 102,68,15,96,202 // punpcklbw %xmm2,%xmm9 - .byte 102,68,15,104,194 // punpckhbw %xmm2,%xmm8 + .byte 117,113 // jne 127 <_sk_load_8888_ssse3_lowp+0x7b> + .byte 69,15,16,76,147,16 // movups 0x10(%r11,%rdx,4),%xmm9 + .byte 69,15,16,4,147 // movups (%r11,%rdx,4),%xmm8 + .byte 102,15,111,5,55,2,0,0 // movdqa 0x237(%rip),%xmm0 # 300 <_sk_swap_rb_ssse3_lowp+0x14> + .byte 102,68,15,56,0,192 // pshufb %xmm0,%xmm8 + .byte 102,68,15,56,0,200 // pshufb %xmm0,%xmm9 + .byte 102,65,15,111,208 // movdqa %xmm8,%xmm2 + .byte 102,65,15,98,209 // punpckldq %xmm9,%xmm2 .byte 102,15,239,219 // pxor %xmm3,%xmm3 .byte 102,15,239,192 // pxor %xmm0,%xmm0 - .byte 102,65,15,96,193 // punpcklbw %xmm9,%xmm0 + .byte 102,15,96,194 // punpcklbw %xmm2,%xmm0 .byte 102,15,239,201 // pxor %xmm1,%xmm1 - .byte 102,65,15,104,201 // punpckhbw %xmm9,%xmm1 + .byte 102,15,104,202 // punpckhbw %xmm2,%xmm1 + .byte 102,69,15,106,193 // punpckhdq %xmm9,%xmm8 .byte 102,15,239,210 // pxor %xmm2,%xmm2 .byte 102,65,15,96,208 // punpcklbw %xmm8,%xmm2 .byte 102,65,15,104,216 // punpckhbw %xmm8,%xmm3 - .byte 102,68,15,111,5,234,1,0,0 // movdqa 0x1ea(%rip),%xmm8 # 300 
<_sk_swap_rb_ssse3_lowp+0x18> + .byte 102,68,15,111,5,1,2,0,0 // movdqa 0x201(%rip),%xmm8 # 310 <_sk_swap_rb_ssse3_lowp+0x24> .byte 102,65,15,228,192 // pmulhuw %xmm8,%xmm0 .byte 102,65,15,228,200 // pmulhuw %xmm8,%xmm1 .byte 102,65,15,228,208 // pmulhuw %xmm8,%xmm2 @@ -36979,53 +36976,52 @@ _sk_load_8888_ssse3_lowp: .byte 255,224 // jmpq *%rax .byte 69,137,193 // mov %r8d,%r9d .byte 65,128,225,7 // and $0x7,%r9b - .byte 102,15,239,201 // pxor %xmm1,%xmm1 - .byte 102,15,239,192 // pxor %xmm0,%xmm0 + .byte 102,69,15,239,201 // pxor %xmm9,%xmm9 + .byte 69,15,87,192 // xorps %xmm8,%xmm8 .byte 65,254,201 // dec %r9b .byte 65,128,249,6 // cmp $0x6,%r9b - .byte 15,135,119,255,255,255 // ja c1 <_sk_load_8888_ssse3_lowp+0x15> + .byte 119,129 // ja c1 <_sk_load_8888_ssse3_lowp+0x15> .byte 69,15,182,201 // movzbl %r9b,%r9d - .byte 76,141,21,119,0,0,0 // lea 0x77(%rip),%r10 # 1cc <_sk_load_8888_ssse3_lowp+0x120> + .byte 76,141,21,133,0,0,0 // lea 0x85(%rip),%r10 # 1d0 <_sk_load_8888_ssse3_lowp+0x124> .byte 75,99,4,138 // movslq (%r10,%r9,4),%rax .byte 76,1,208 // add %r10,%rax .byte 255,224 // jmpq *%rax .byte 102,65,15,110,68,147,24 // movd 0x18(%r11,%rdx,4),%xmm0 - .byte 102,15,112,200,69 // pshufd $0x45,%xmm0,%xmm1 - .byte 102,15,239,192 // pxor %xmm0,%xmm0 - .byte 243,65,15,16,84,147,20 // movss 0x14(%r11,%rdx,4),%xmm2 - .byte 15,198,209,0 // shufps $0x0,%xmm1,%xmm2 - .byte 15,198,209,226 // shufps $0xe2,%xmm1,%xmm2 - .byte 15,40,202 // movaps %xmm2,%xmm1 - .byte 243,65,15,16,84,147,16 // movss 0x10(%r11,%rdx,4),%xmm2 - .byte 243,15,16,202 // movss %xmm2,%xmm1 - .byte 243,65,15,16,84,147,12 // movss 0xc(%r11,%rdx,4),%xmm2 - .byte 15,198,208,32 // shufps $0x20,%xmm0,%xmm2 - .byte 15,198,194,36 // shufps $0x24,%xmm2,%xmm0 - .byte 243,65,15,16,84,147,8 // movss 0x8(%r11,%rdx,4),%xmm2 - .byte 15,198,208,48 // shufps $0x30,%xmm0,%xmm2 - .byte 15,198,194,132 // shufps $0x84,%xmm2,%xmm0 - .byte 243,65,15,16,84,147,4 // movss 0x4(%r11,%rdx,4),%xmm2 - .byte 15,198,208,0 // shufps 
$0x0,%xmm0,%xmm2 - .byte 15,198,208,226 // shufps $0xe2,%xmm0,%xmm2 - .byte 15,40,194 // movaps %xmm2,%xmm0 - .byte 243,65,15,16,20,147 // movss (%r11,%rdx,4),%xmm2 - .byte 243,15,16,194 // movss %xmm2,%xmm0 - .byte 233,247,254,255,255 // jmpq c1 <_sk_load_8888_ssse3_lowp+0x15> - .byte 102,144 // xchg %ax,%ax - .byte 239 // out %eax,(%dx) - .byte 255 // (bad) + .byte 102,68,15,112,200,69 // pshufd $0x45,%xmm0,%xmm9 + .byte 69,15,87,192 // xorps %xmm8,%xmm8 + .byte 243,65,15,16,68,147,20 // movss 0x14(%r11,%rdx,4),%xmm0 + .byte 65,15,198,193,0 // shufps $0x0,%xmm9,%xmm0 + .byte 65,15,198,193,226 // shufps $0xe2,%xmm9,%xmm0 + .byte 68,15,40,200 // movaps %xmm0,%xmm9 + .byte 243,65,15,16,68,147,16 // movss 0x10(%r11,%rdx,4),%xmm0 + .byte 243,68,15,16,200 // movss %xmm0,%xmm9 + .byte 243,65,15,16,68,147,12 // movss 0xc(%r11,%rdx,4),%xmm0 + .byte 65,15,198,192,32 // shufps $0x20,%xmm8,%xmm0 + .byte 68,15,198,192,36 // shufps $0x24,%xmm0,%xmm8 + .byte 243,65,15,16,68,147,8 // movss 0x8(%r11,%rdx,4),%xmm0 + .byte 65,15,198,192,48 // shufps $0x30,%xmm8,%xmm0 + .byte 68,15,198,192,132 // shufps $0x84,%xmm0,%xmm8 + .byte 243,65,15,16,68,147,4 // movss 0x4(%r11,%rdx,4),%xmm0 + .byte 65,15,198,192,0 // shufps $0x0,%xmm8,%xmm0 + .byte 65,15,198,192,226 // shufps $0xe2,%xmm8,%xmm0 + .byte 68,15,40,192 // movaps %xmm0,%xmm8 + .byte 243,65,15,16,4,147 // movss (%r11,%rdx,4),%xmm0 + .byte 243,68,15,16,192 // movss %xmm0,%xmm8 + .byte 233,244,254,255,255 // jmpq c1 <_sk_load_8888_ssse3_lowp+0x15> + .byte 15,31,0 // nopl (%rax) + .byte 237 // in (%dx),%eax .byte 255 // (bad) .byte 255 // (bad) - .byte 221,255 // (bad) .byte 255 // (bad) - .byte 255,206 // dec %esi + .byte 216,255 // fdivr %st(7),%st .byte 255 // (bad) + .byte 255,199 // inc %edi .byte 255 // (bad) .byte 255 // (bad) - .byte 191,255,255,255,180 // mov $0xb4ffffff,%edi + .byte 255,182,255,255,255,170 // pushq -0x55000001(%rsi) .byte 255 // (bad) .byte 255 // (bad) - .byte 255,162,255,255,255,146 // jmpq 
*-0x6d000001(%rdx) + .byte 255,149,255,255,255,132 // callq *-0x7b000001(%rbp) .byte 255 // (bad) .byte 255 // (bad) .byte 255 // .byte 0xff @@ -37053,7 +37049,7 @@ _sk_store_8888_ssse3_lowp: .byte 102,69,15,97,194 // punpcklwd %xmm10,%xmm8 .byte 102,69,15,105,202 // punpckhwd %xmm10,%xmm9 .byte 77,133,192 // test %r8,%r8 - .byte 117,17 // jne 257 <_sk_store_8888_ssse3_lowp+0x6f> + .byte 117,17 // jne 25b <_sk_store_8888_ssse3_lowp+0x6f> .byte 243,69,15,127,76,147,16 // movdqu %xmm9,0x10(%r11,%rdx,4) .byte 243,69,15,127,4,147 // movdqu %xmm8,(%r11,%rdx,4) .byte 72,173 // lods %ds:(%rsi),%rax @@ -37062,9 +37058,9 @@ _sk_store_8888_ssse3_lowp: .byte 65,128,225,7 // and $0x7,%r9b .byte 65,254,201 // dec %r9b .byte 65,128,249,6 // cmp $0x6,%r9b - .byte 119,236 // ja 253 <_sk_store_8888_ssse3_lowp+0x6b> + .byte 119,236 // ja 257 <_sk_store_8888_ssse3_lowp+0x6b> .byte 69,15,182,201 // movzbl %r9b,%r9d - .byte 76,141,21,90,0,0,0 // lea 0x5a(%rip),%r10 # 2cc <_sk_store_8888_ssse3_lowp+0xe4> + .byte 76,141,21,90,0,0,0 // lea 0x5a(%rip),%r10 # 2d0 <_sk_store_8888_ssse3_lowp+0xe4> .byte 75,99,4,138 // movslq (%r10,%r9,4),%rax .byte 76,1,208 // add %r10,%rax .byte 255,224 // jmpq *%rax @@ -37080,7 +37076,7 @@ _sk_store_8888_ssse3_lowp: .byte 102,69,15,112,200,229 // pshufd $0xe5,%xmm8,%xmm9 .byte 102,69,15,126,76,147,4 // movd %xmm9,0x4(%r11,%rdx,4) .byte 102,69,15,126,4,147 // movd %xmm8,(%r11,%rdx,4) - .byte 235,136 // jmp 253 <_sk_store_8888_ssse3_lowp+0x6b> + .byte 235,136 // jmp 257 <_sk_store_8888_ssse3_lowp+0x6b> .byte 144 // nop .byte 247,255 // idiv %edi .byte 255 // (bad) @@ -37114,6 +37110,12 @@ _sk_swap_rb_ssse3_lowp: .byte 255,224 // jmpq *%rax BALIGN16 + .byte 0,4,8 // add %al,(%rax,%rcx,1) + .byte 12,1 // or $0x1,%al + .byte 5,9,13,2,6 // add $0x6020d09,%eax + .byte 10,14 // or (%rsi),%cl + .byte 3,7 // add (%rdi),%eax + .byte 11,15 // or (%rdi),%ecx .byte 129,128,129,128,129,128,129,128,129,128// addl $0x80818081,-0x7f7e7f7f(%rax) .byte 129 // .byte 0x81 .byte 
128 // .byte 0x80 diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S index 45b96f6495..ecbffaa201 100644 --- a/src/jumper/SkJumper_generated_win.S +++ b/src/jumper/SkJumper_generated_win.S @@ -26363,27 +26363,24 @@ _sk_load_8888_ssse3_lowp LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax DB 76,139,24 ; mov (%rax),%r11 DB 77,133,192 ; test %r8,%r8 - DB 117,120 ; jne 1be <_sk_load_8888_ssse3_lowp+0x82> - DB 65,15,16,76,147,16 ; movups 0x10(%r11,%rdx,4),%xmm1 - DB 65,15,16,4,147 ; movups (%r11,%rdx,4),%xmm0 - DB 15,40,208 ; movaps %xmm0,%xmm2 - DB 102,15,96,209 ; punpcklbw %xmm1,%xmm2 - DB 102,15,104,193 ; punpckhbw %xmm1,%xmm0 - DB 102,68,15,111,194 ; movdqa %xmm2,%xmm8 - DB 102,68,15,96,192 ; punpcklbw %xmm0,%xmm8 - DB 102,15,104,208 ; punpckhbw %xmm0,%xmm2 - DB 102,69,15,111,200 ; movdqa %xmm8,%xmm9 - DB 102,68,15,96,202 ; punpcklbw %xmm2,%xmm9 - DB 102,68,15,104,194 ; punpckhbw %xmm2,%xmm8 + DB 117,113 ; jne 1b7 <_sk_load_8888_ssse3_lowp+0x7b> + DB 69,15,16,76,147,16 ; movups 0x10(%r11,%rdx,4),%xmm9 + DB 69,15,16,4,147 ; movups (%r11,%rdx,4),%xmm8 + DB 102,15,111,5,55,2,0,0 ; movdqa 0x237(%rip),%xmm0 # 390 <_sk_swap_rb_ssse3_lowp+0x14> + DB 102,68,15,56,0,192 ; pshufb %xmm0,%xmm8 + DB 102,68,15,56,0,200 ; pshufb %xmm0,%xmm9 + DB 102,65,15,111,208 ; movdqa %xmm8,%xmm2 + DB 102,65,15,98,209 ; punpckldq %xmm9,%xmm2 DB 102,15,239,219 ; pxor %xmm3,%xmm3 DB 102,15,239,192 ; pxor %xmm0,%xmm0 - DB 102,65,15,96,193 ; punpcklbw %xmm9,%xmm0 + DB 102,15,96,194 ; punpcklbw %xmm2,%xmm0 DB 102,15,239,201 ; pxor %xmm1,%xmm1 - DB 102,65,15,104,201 ; punpckhbw %xmm9,%xmm1 + DB 102,15,104,202 ; punpckhbw %xmm2,%xmm1 + DB 102,69,15,106,193 ; punpckhdq %xmm9,%xmm8 DB 102,15,239,210 ; pxor %xmm2,%xmm2 DB 102,65,15,96,208 ; punpcklbw %xmm8,%xmm2 DB 102,65,15,104,216 ; punpckhbw %xmm8,%xmm3 - DB 102,68,15,111,5,234,1,0,0 ; movdqa 0x1ea(%rip),%xmm8 # 390 <_sk_swap_rb_ssse3_lowp+0x18> + DB 102,68,15,111,5,1,2,0,0 ; movdqa 0x201(%rip),%xmm8 # 3a0 
<_sk_swap_rb_ssse3_lowp+0x24> DB 102,65,15,228,192 ; pmulhuw %xmm8,%xmm0 DB 102,65,15,228,200 ; pmulhuw %xmm8,%xmm1 DB 102,65,15,228,208 ; pmulhuw %xmm8,%xmm2 @@ -26392,53 +26389,52 @@ _sk_load_8888_ssse3_lowp LABEL PROC DB 255,224 ; jmpq *%rax DB 69,137,193 ; mov %r8d,%r9d DB 65,128,225,7 ; and $0x7,%r9b - DB 102,15,239,201 ; pxor %xmm1,%xmm1 - DB 102,15,239,192 ; pxor %xmm0,%xmm0 + DB 102,69,15,239,201 ; pxor %xmm9,%xmm9 + DB 69,15,87,192 ; xorps %xmm8,%xmm8 DB 65,254,201 ; dec %r9b DB 65,128,249,6 ; cmp $0x6,%r9b - DB 15,135,119,255,255,255 ; ja 151 <_sk_load_8888_ssse3_lowp+0x15> + DB 119,129 ; ja 151 <_sk_load_8888_ssse3_lowp+0x15> DB 69,15,182,201 ; movzbl %r9b,%r9d - DB 76,141,21,119,0,0,0 ; lea 0x77(%rip),%r10 # 25c <_sk_load_8888_ssse3_lowp+0x120> + DB 76,141,21,133,0,0,0 ; lea 0x85(%rip),%r10 # 260 <_sk_load_8888_ssse3_lowp+0x124> DB 75,99,4,138 ; movslq (%r10,%r9,4),%rax DB 76,1,208 ; add %r10,%rax DB 255,224 ; jmpq *%rax DB 102,65,15,110,68,147,24 ; movd 0x18(%r11,%rdx,4),%xmm0 - DB 102,15,112,200,69 ; pshufd $0x45,%xmm0,%xmm1 - DB 102,15,239,192 ; pxor %xmm0,%xmm0 - DB 243,65,15,16,84,147,20 ; movss 0x14(%r11,%rdx,4),%xmm2 - DB 15,198,209,0 ; shufps $0x0,%xmm1,%xmm2 - DB 15,198,209,226 ; shufps $0xe2,%xmm1,%xmm2 - DB 15,40,202 ; movaps %xmm2,%xmm1 - DB 243,65,15,16,84,147,16 ; movss 0x10(%r11,%rdx,4),%xmm2 - DB 243,15,16,202 ; movss %xmm2,%xmm1 - DB 243,65,15,16,84,147,12 ; movss 0xc(%r11,%rdx,4),%xmm2 - DB 15,198,208,32 ; shufps $0x20,%xmm0,%xmm2 - DB 15,198,194,36 ; shufps $0x24,%xmm2,%xmm0 - DB 243,65,15,16,84,147,8 ; movss 0x8(%r11,%rdx,4),%xmm2 - DB 15,198,208,48 ; shufps $0x30,%xmm0,%xmm2 - DB 15,198,194,132 ; shufps $0x84,%xmm2,%xmm0 - DB 243,65,15,16,84,147,4 ; movss 0x4(%r11,%rdx,4),%xmm2 - DB 15,198,208,0 ; shufps $0x0,%xmm0,%xmm2 - DB 15,198,208,226 ; shufps $0xe2,%xmm0,%xmm2 - DB 15,40,194 ; movaps %xmm2,%xmm0 - DB 243,65,15,16,20,147 ; movss (%r11,%rdx,4),%xmm2 - DB 243,15,16,194 ; movss %xmm2,%xmm0 - DB 233,247,254,255,255 ; jmpq 151 
<_sk_load_8888_ssse3_lowp+0x15> - DB 102,144 ; xchg %ax,%ax - DB 239 ; out %eax,(%dx) - DB 255 ; (bad) + DB 102,68,15,112,200,69 ; pshufd $0x45,%xmm0,%xmm9 + DB 69,15,87,192 ; xorps %xmm8,%xmm8 + DB 243,65,15,16,68,147,20 ; movss 0x14(%r11,%rdx,4),%xmm0 + DB 65,15,198,193,0 ; shufps $0x0,%xmm9,%xmm0 + DB 65,15,198,193,226 ; shufps $0xe2,%xmm9,%xmm0 + DB 68,15,40,200 ; movaps %xmm0,%xmm9 + DB 243,65,15,16,68,147,16 ; movss 0x10(%r11,%rdx,4),%xmm0 + DB 243,68,15,16,200 ; movss %xmm0,%xmm9 + DB 243,65,15,16,68,147,12 ; movss 0xc(%r11,%rdx,4),%xmm0 + DB 65,15,198,192,32 ; shufps $0x20,%xmm8,%xmm0 + DB 68,15,198,192,36 ; shufps $0x24,%xmm0,%xmm8 + DB 243,65,15,16,68,147,8 ; movss 0x8(%r11,%rdx,4),%xmm0 + DB 65,15,198,192,48 ; shufps $0x30,%xmm8,%xmm0 + DB 68,15,198,192,132 ; shufps $0x84,%xmm0,%xmm8 + DB 243,65,15,16,68,147,4 ; movss 0x4(%r11,%rdx,4),%xmm0 + DB 65,15,198,192,0 ; shufps $0x0,%xmm8,%xmm0 + DB 65,15,198,192,226 ; shufps $0xe2,%xmm8,%xmm0 + DB 68,15,40,192 ; movaps %xmm0,%xmm8 + DB 243,65,15,16,4,147 ; movss (%r11,%rdx,4),%xmm0 + DB 243,68,15,16,192 ; movss %xmm0,%xmm8 + DB 233,244,254,255,255 ; jmpq 151 <_sk_load_8888_ssse3_lowp+0x15> + DB 15,31,0 ; nopl (%rax) + DB 237 ; in (%dx),%eax DB 255 ; (bad) DB 255 ; (bad) - DB 221,255 ; (bad) DB 255 ; (bad) - DB 255,206 ; dec %esi + DB 216,255 ; fdivr %st(7),%st DB 255 ; (bad) + DB 255,199 ; inc %edi DB 255 ; (bad) DB 255 ; (bad) - DB 191,255,255,255,180 ; mov $0xb4ffffff,%edi + DB 255,182,255,255,255,170 ; pushq -0x55000001(%rsi) DB 255 ; (bad) DB 255 ; (bad) - DB 255,162,255,255,255,146 ; jmpq *-0x6d000001(%rdx) + DB 255,149,255,255,255,132 ; callq *-0x7b000001(%rbp) DB 255 ; (bad) DB 255 ; (bad) DB 255 ; .byte 0xff @@ -26464,7 +26460,7 @@ _sk_store_8888_ssse3_lowp LABEL PROC DB 102,69,15,97,194 ; punpcklwd %xmm10,%xmm8 DB 102,69,15,105,202 ; punpckhwd %xmm10,%xmm9 DB 77,133,192 ; test %r8,%r8 - DB 117,17 ; jne 2e7 <_sk_store_8888_ssse3_lowp+0x6f> + DB 117,17 ; jne 2eb <_sk_store_8888_ssse3_lowp+0x6f> DB 
243,69,15,127,76,147,16 ; movdqu %xmm9,0x10(%r11,%rdx,4) DB 243,69,15,127,4,147 ; movdqu %xmm8,(%r11,%rdx,4) DB 72,173 ; lods %ds:(%rsi),%rax @@ -26473,9 +26469,9 @@ _sk_store_8888_ssse3_lowp LABEL PROC DB 65,128,225,7 ; and $0x7,%r9b DB 65,254,201 ; dec %r9b DB 65,128,249,6 ; cmp $0x6,%r9b - DB 119,236 ; ja 2e3 <_sk_store_8888_ssse3_lowp+0x6b> + DB 119,236 ; ja 2e7 <_sk_store_8888_ssse3_lowp+0x6b> DB 69,15,182,201 ; movzbl %r9b,%r9d - DB 76,141,21,90,0,0,0 ; lea 0x5a(%rip),%r10 # 35c <_sk_store_8888_ssse3_lowp+0xe4> + DB 76,141,21,90,0,0,0 ; lea 0x5a(%rip),%r10 # 360 <_sk_store_8888_ssse3_lowp+0xe4> DB 75,99,4,138 ; movslq (%r10,%r9,4),%rax DB 76,1,208 ; add %r10,%rax DB 255,224 ; jmpq *%rax @@ -26491,7 +26487,7 @@ _sk_store_8888_ssse3_lowp LABEL PROC DB 102,69,15,112,200,229 ; pshufd $0xe5,%xmm8,%xmm9 DB 102,69,15,126,76,147,4 ; movd %xmm9,0x4(%r11,%rdx,4) DB 102,69,15,126,4,147 ; movd %xmm8,(%r11,%rdx,4) - DB 235,136 ; jmp 2e3 <_sk_store_8888_ssse3_lowp+0x6b> + DB 235,136 ; jmp 2e7 <_sk_store_8888_ssse3_lowp+0x6b> DB 144 ; nop DB 247,255 ; idiv %edi DB 255 ; (bad) @@ -26523,6 +26519,12 @@ _sk_swap_rb_ssse3_lowp LABEL PROC DB 255,224 ; jmpq *%rax ALIGN 16 + DB 0,4,8 ; add %al,(%rax,%rcx,1) + DB 12,1 ; or $0x1,%al + DB 5,9,13,2,6 ; add $0x6020d09,%eax + DB 10,14 ; or (%rsi),%cl + DB 3,7 ; add (%rdi),%eax + DB 11,15 ; or (%rdi),%ecx DB 129,128,129,128,129,128,129,128,129,128; addl $0x80818081,-0x7f7e7f7f(%rax) DB 129 ; .byte 0x81 DB 128 ; .byte 0x80 diff --git a/src/jumper/SkJumper_stages_lowp.cpp b/src/jumper/SkJumper_stages_lowp.cpp index 81520f16ab..4d0df94af6 100644 --- a/src/jumper/SkJumper_stages_lowp.cpp +++ b/src/jumper/SkJumper_stages_lowp.cpp @@ -121,14 +121,13 @@ SI void from_8888(U32 rgba, F* r, F* g, F* b, F* a) { U16 lo = unaligned_load<U16>((const uint32_t*)&rgba + 0), hi = unaligned_load<U16>((const uint32_t*)&rgba + 4); - U16 _0415 = _mm_unpacklo_epi8(lo, hi), // r0 r4 g0 g4 b0 b4 a0 a4 r1 r5 g1 g5 b1 b5 a1 a5 - _2637 = _mm_unpackhi_epi8(lo, hi); + 
// Shuffle so that the 4 bytes of each color channel are contiguous... + lo = _mm_shuffle_epi8(lo, _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15)); + hi = _mm_shuffle_epi8(hi, _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15)); - U16 even = _mm_unpacklo_epi8(_0415, _2637), // r0 r2 r4 r6 g0 g2 g4 g6 b0 b2 b4 b6 a0 a2 a4 a6 - odd = _mm_unpackhi_epi8(_0415, _2637); - - U16 rg = _mm_unpacklo_epi8(even, odd), // r0 r1 r2 r3 r4 r5 r6 r7 g0 g1 g2 g3 g4 g5 g6 g7 - ba = _mm_unpackhi_epi8(even, odd); + // ...then get all 8 bytes of each color channel together into a single register. + U16 rg = _mm_unpacklo_epi32(lo,hi), + ba = _mm_unpackhi_epi32(lo,hi); // Unpack as 16-bit values into the high half of each 16-bit lane, to get a free *256. U16 R = _mm_unpacklo_epi8(U16(0), rg), |