about | summary | refs | log | tree | commit | diff | homepage
path: root/src
diff options
context:
space:
mode:
author: Mike Klein <mtklein@chromium.org> 2017-06-04 18:57:48 -0400
committer: Skia Commit-Bot <skia-commit-bot@chromium.org> 2017-06-05 16:01:16 +0000
commit: 1f29bf093f01e9c9bf79cbd2ac27da62efc8e3a4 (patch)
tree: a823f2f44117844723463d47e6be325a7abfd026 /src
parent: a86385d2ff1b7db2abc19316d3e30aba54fa20fa (diff)
slight streamlining for lowp load_8888 with pshufb
We can use 2 pshufb to replace 4 unpacks when deinterlacing the colors.

Change-Id: I713fbbc94f5cb9eaf14f85323b0ec76dc2246e98
Reviewed-on: https://skia-review.googlesource.com/18531
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
Diffstat (limited to 'src')
-rw-r--r-- src/jumper/SkJumper_generated.S 104
-rw-r--r-- src/jumper/SkJumper_generated_win.S 104
-rw-r--r-- src/jumper/SkJumper_stages_lowp.cpp 13
3 files changed, 112 insertions, 109 deletions
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index 7565730544..52bf029be2 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -36950,27 +36950,24 @@ _sk_load_8888_ssse3_lowp:
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 76,139,24 // mov (%rax),%r11
.byte 77,133,192 // test %r8,%r8
- .byte 117,120 // jne 12e <_sk_load_8888_ssse3_lowp+0x82>
- .byte 65,15,16,76,147,16 // movups 0x10(%r11,%rdx,4),%xmm1
- .byte 65,15,16,4,147 // movups (%r11,%rdx,4),%xmm0
- .byte 15,40,208 // movaps %xmm0,%xmm2
- .byte 102,15,96,209 // punpcklbw %xmm1,%xmm2
- .byte 102,15,104,193 // punpckhbw %xmm1,%xmm0
- .byte 102,68,15,111,194 // movdqa %xmm2,%xmm8
- .byte 102,68,15,96,192 // punpcklbw %xmm0,%xmm8
- .byte 102,15,104,208 // punpckhbw %xmm0,%xmm2
- .byte 102,69,15,111,200 // movdqa %xmm8,%xmm9
- .byte 102,68,15,96,202 // punpcklbw %xmm2,%xmm9
- .byte 102,68,15,104,194 // punpckhbw %xmm2,%xmm8
+ .byte 117,113 // jne 127 <_sk_load_8888_ssse3_lowp+0x7b>
+ .byte 69,15,16,76,147,16 // movups 0x10(%r11,%rdx,4),%xmm9
+ .byte 69,15,16,4,147 // movups (%r11,%rdx,4),%xmm8
+ .byte 102,15,111,5,55,2,0,0 // movdqa 0x237(%rip),%xmm0 # 300 <_sk_swap_rb_ssse3_lowp+0x14>
+ .byte 102,68,15,56,0,192 // pshufb %xmm0,%xmm8
+ .byte 102,68,15,56,0,200 // pshufb %xmm0,%xmm9
+ .byte 102,65,15,111,208 // movdqa %xmm8,%xmm2
+ .byte 102,65,15,98,209 // punpckldq %xmm9,%xmm2
.byte 102,15,239,219 // pxor %xmm3,%xmm3
.byte 102,15,239,192 // pxor %xmm0,%xmm0
- .byte 102,65,15,96,193 // punpcklbw %xmm9,%xmm0
+ .byte 102,15,96,194 // punpcklbw %xmm2,%xmm0
.byte 102,15,239,201 // pxor %xmm1,%xmm1
- .byte 102,65,15,104,201 // punpckhbw %xmm9,%xmm1
+ .byte 102,15,104,202 // punpckhbw %xmm2,%xmm1
+ .byte 102,69,15,106,193 // punpckhdq %xmm9,%xmm8
.byte 102,15,239,210 // pxor %xmm2,%xmm2
.byte 102,65,15,96,208 // punpcklbw %xmm8,%xmm2
.byte 102,65,15,104,216 // punpckhbw %xmm8,%xmm3
- .byte 102,68,15,111,5,234,1,0,0 // movdqa 0x1ea(%rip),%xmm8 # 300 <_sk_swap_rb_ssse3_lowp+0x18>
+ .byte 102,68,15,111,5,1,2,0,0 // movdqa 0x201(%rip),%xmm8 # 310 <_sk_swap_rb_ssse3_lowp+0x24>
.byte 102,65,15,228,192 // pmulhuw %xmm8,%xmm0
.byte 102,65,15,228,200 // pmulhuw %xmm8,%xmm1
.byte 102,65,15,228,208 // pmulhuw %xmm8,%xmm2
@@ -36979,53 +36976,52 @@ _sk_load_8888_ssse3_lowp:
.byte 255,224 // jmpq *%rax
.byte 69,137,193 // mov %r8d,%r9d
.byte 65,128,225,7 // and $0x7,%r9b
- .byte 102,15,239,201 // pxor %xmm1,%xmm1
- .byte 102,15,239,192 // pxor %xmm0,%xmm0
+ .byte 102,69,15,239,201 // pxor %xmm9,%xmm9
+ .byte 69,15,87,192 // xorps %xmm8,%xmm8
.byte 65,254,201 // dec %r9b
.byte 65,128,249,6 // cmp $0x6,%r9b
- .byte 15,135,119,255,255,255 // ja c1 <_sk_load_8888_ssse3_lowp+0x15>
+ .byte 119,129 // ja c1 <_sk_load_8888_ssse3_lowp+0x15>
.byte 69,15,182,201 // movzbl %r9b,%r9d
- .byte 76,141,21,119,0,0,0 // lea 0x77(%rip),%r10 # 1cc <_sk_load_8888_ssse3_lowp+0x120>
+ .byte 76,141,21,133,0,0,0 // lea 0x85(%rip),%r10 # 1d0 <_sk_load_8888_ssse3_lowp+0x124>
.byte 75,99,4,138 // movslq (%r10,%r9,4),%rax
.byte 76,1,208 // add %r10,%rax
.byte 255,224 // jmpq *%rax
.byte 102,65,15,110,68,147,24 // movd 0x18(%r11,%rdx,4),%xmm0
- .byte 102,15,112,200,69 // pshufd $0x45,%xmm0,%xmm1
- .byte 102,15,239,192 // pxor %xmm0,%xmm0
- .byte 243,65,15,16,84,147,20 // movss 0x14(%r11,%rdx,4),%xmm2
- .byte 15,198,209,0 // shufps $0x0,%xmm1,%xmm2
- .byte 15,198,209,226 // shufps $0xe2,%xmm1,%xmm2
- .byte 15,40,202 // movaps %xmm2,%xmm1
- .byte 243,65,15,16,84,147,16 // movss 0x10(%r11,%rdx,4),%xmm2
- .byte 243,15,16,202 // movss %xmm2,%xmm1
- .byte 243,65,15,16,84,147,12 // movss 0xc(%r11,%rdx,4),%xmm2
- .byte 15,198,208,32 // shufps $0x20,%xmm0,%xmm2
- .byte 15,198,194,36 // shufps $0x24,%xmm2,%xmm0
- .byte 243,65,15,16,84,147,8 // movss 0x8(%r11,%rdx,4),%xmm2
- .byte 15,198,208,48 // shufps $0x30,%xmm0,%xmm2
- .byte 15,198,194,132 // shufps $0x84,%xmm2,%xmm0
- .byte 243,65,15,16,84,147,4 // movss 0x4(%r11,%rdx,4),%xmm2
- .byte 15,198,208,0 // shufps $0x0,%xmm0,%xmm2
- .byte 15,198,208,226 // shufps $0xe2,%xmm0,%xmm2
- .byte 15,40,194 // movaps %xmm2,%xmm0
- .byte 243,65,15,16,20,147 // movss (%r11,%rdx,4),%xmm2
- .byte 243,15,16,194 // movss %xmm2,%xmm0
- .byte 233,247,254,255,255 // jmpq c1 <_sk_load_8888_ssse3_lowp+0x15>
- .byte 102,144 // xchg %ax,%ax
- .byte 239 // out %eax,(%dx)
- .byte 255 // (bad)
+ .byte 102,68,15,112,200,69 // pshufd $0x45,%xmm0,%xmm9
+ .byte 69,15,87,192 // xorps %xmm8,%xmm8
+ .byte 243,65,15,16,68,147,20 // movss 0x14(%r11,%rdx,4),%xmm0
+ .byte 65,15,198,193,0 // shufps $0x0,%xmm9,%xmm0
+ .byte 65,15,198,193,226 // shufps $0xe2,%xmm9,%xmm0
+ .byte 68,15,40,200 // movaps %xmm0,%xmm9
+ .byte 243,65,15,16,68,147,16 // movss 0x10(%r11,%rdx,4),%xmm0
+ .byte 243,68,15,16,200 // movss %xmm0,%xmm9
+ .byte 243,65,15,16,68,147,12 // movss 0xc(%r11,%rdx,4),%xmm0
+ .byte 65,15,198,192,32 // shufps $0x20,%xmm8,%xmm0
+ .byte 68,15,198,192,36 // shufps $0x24,%xmm0,%xmm8
+ .byte 243,65,15,16,68,147,8 // movss 0x8(%r11,%rdx,4),%xmm0
+ .byte 65,15,198,192,48 // shufps $0x30,%xmm8,%xmm0
+ .byte 68,15,198,192,132 // shufps $0x84,%xmm0,%xmm8
+ .byte 243,65,15,16,68,147,4 // movss 0x4(%r11,%rdx,4),%xmm0
+ .byte 65,15,198,192,0 // shufps $0x0,%xmm8,%xmm0
+ .byte 65,15,198,192,226 // shufps $0xe2,%xmm8,%xmm0
+ .byte 68,15,40,192 // movaps %xmm0,%xmm8
+ .byte 243,65,15,16,4,147 // movss (%r11,%rdx,4),%xmm0
+ .byte 243,68,15,16,192 // movss %xmm0,%xmm8
+ .byte 233,244,254,255,255 // jmpq c1 <_sk_load_8888_ssse3_lowp+0x15>
+ .byte 15,31,0 // nopl (%rax)
+ .byte 237 // in (%dx),%eax
.byte 255 // (bad)
.byte 255 // (bad)
- .byte 221,255 // (bad)
.byte 255 // (bad)
- .byte 255,206 // dec %esi
+ .byte 216,255 // fdivr %st(7),%st
.byte 255 // (bad)
+ .byte 255,199 // inc %edi
.byte 255 // (bad)
.byte 255 // (bad)
- .byte 191,255,255,255,180 // mov $0xb4ffffff,%edi
+ .byte 255,182,255,255,255,170 // pushq -0x55000001(%rsi)
.byte 255 // (bad)
.byte 255 // (bad)
- .byte 255,162,255,255,255,146 // jmpq *-0x6d000001(%rdx)
+ .byte 255,149,255,255,255,132 // callq *-0x7b000001(%rbp)
.byte 255 // (bad)
.byte 255 // (bad)
.byte 255 // .byte 0xff
@@ -37053,7 +37049,7 @@ _sk_store_8888_ssse3_lowp:
.byte 102,69,15,97,194 // punpcklwd %xmm10,%xmm8
.byte 102,69,15,105,202 // punpckhwd %xmm10,%xmm9
.byte 77,133,192 // test %r8,%r8
- .byte 117,17 // jne 257 <_sk_store_8888_ssse3_lowp+0x6f>
+ .byte 117,17 // jne 25b <_sk_store_8888_ssse3_lowp+0x6f>
.byte 243,69,15,127,76,147,16 // movdqu %xmm9,0x10(%r11,%rdx,4)
.byte 243,69,15,127,4,147 // movdqu %xmm8,(%r11,%rdx,4)
.byte 72,173 // lods %ds:(%rsi),%rax
@@ -37062,9 +37058,9 @@ _sk_store_8888_ssse3_lowp:
.byte 65,128,225,7 // and $0x7,%r9b
.byte 65,254,201 // dec %r9b
.byte 65,128,249,6 // cmp $0x6,%r9b
- .byte 119,236 // ja 253 <_sk_store_8888_ssse3_lowp+0x6b>
+ .byte 119,236 // ja 257 <_sk_store_8888_ssse3_lowp+0x6b>
.byte 69,15,182,201 // movzbl %r9b,%r9d
- .byte 76,141,21,90,0,0,0 // lea 0x5a(%rip),%r10 # 2cc <_sk_store_8888_ssse3_lowp+0xe4>
+ .byte 76,141,21,90,0,0,0 // lea 0x5a(%rip),%r10 # 2d0 <_sk_store_8888_ssse3_lowp+0xe4>
.byte 75,99,4,138 // movslq (%r10,%r9,4),%rax
.byte 76,1,208 // add %r10,%rax
.byte 255,224 // jmpq *%rax
@@ -37080,7 +37076,7 @@ _sk_store_8888_ssse3_lowp:
.byte 102,69,15,112,200,229 // pshufd $0xe5,%xmm8,%xmm9
.byte 102,69,15,126,76,147,4 // movd %xmm9,0x4(%r11,%rdx,4)
.byte 102,69,15,126,4,147 // movd %xmm8,(%r11,%rdx,4)
- .byte 235,136 // jmp 253 <_sk_store_8888_ssse3_lowp+0x6b>
+ .byte 235,136 // jmp 257 <_sk_store_8888_ssse3_lowp+0x6b>
.byte 144 // nop
.byte 247,255 // idiv %edi
.byte 255 // (bad)
@@ -37114,6 +37110,12 @@ _sk_swap_rb_ssse3_lowp:
.byte 255,224 // jmpq *%rax
BALIGN16
+ .byte 0,4,8 // add %al,(%rax,%rcx,1)
+ .byte 12,1 // or $0x1,%al
+ .byte 5,9,13,2,6 // add $0x6020d09,%eax
+ .byte 10,14 // or (%rsi),%cl
+ .byte 3,7 // add (%rdi),%eax
+ .byte 11,15 // or (%rdi),%ecx
.byte 129,128,129,128,129,128,129,128,129,128// addl $0x80818081,-0x7f7e7f7f(%rax)
.byte 129 // .byte 0x81
.byte 128 // .byte 0x80
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index 45b96f6495..ecbffaa201 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -26363,27 +26363,24 @@ _sk_load_8888_ssse3_lowp LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
DB 76,139,24 ; mov (%rax),%r11
DB 77,133,192 ; test %r8,%r8
- DB 117,120 ; jne 1be <_sk_load_8888_ssse3_lowp+0x82>
- DB 65,15,16,76,147,16 ; movups 0x10(%r11,%rdx,4),%xmm1
- DB 65,15,16,4,147 ; movups (%r11,%rdx,4),%xmm0
- DB 15,40,208 ; movaps %xmm0,%xmm2
- DB 102,15,96,209 ; punpcklbw %xmm1,%xmm2
- DB 102,15,104,193 ; punpckhbw %xmm1,%xmm0
- DB 102,68,15,111,194 ; movdqa %xmm2,%xmm8
- DB 102,68,15,96,192 ; punpcklbw %xmm0,%xmm8
- DB 102,15,104,208 ; punpckhbw %xmm0,%xmm2
- DB 102,69,15,111,200 ; movdqa %xmm8,%xmm9
- DB 102,68,15,96,202 ; punpcklbw %xmm2,%xmm9
- DB 102,68,15,104,194 ; punpckhbw %xmm2,%xmm8
+ DB 117,113 ; jne 1b7 <_sk_load_8888_ssse3_lowp+0x7b>
+ DB 69,15,16,76,147,16 ; movups 0x10(%r11,%rdx,4),%xmm9
+ DB 69,15,16,4,147 ; movups (%r11,%rdx,4),%xmm8
+ DB 102,15,111,5,55,2,0,0 ; movdqa 0x237(%rip),%xmm0 # 390 <_sk_swap_rb_ssse3_lowp+0x14>
+ DB 102,68,15,56,0,192 ; pshufb %xmm0,%xmm8
+ DB 102,68,15,56,0,200 ; pshufb %xmm0,%xmm9
+ DB 102,65,15,111,208 ; movdqa %xmm8,%xmm2
+ DB 102,65,15,98,209 ; punpckldq %xmm9,%xmm2
DB 102,15,239,219 ; pxor %xmm3,%xmm3
DB 102,15,239,192 ; pxor %xmm0,%xmm0
- DB 102,65,15,96,193 ; punpcklbw %xmm9,%xmm0
+ DB 102,15,96,194 ; punpcklbw %xmm2,%xmm0
DB 102,15,239,201 ; pxor %xmm1,%xmm1
- DB 102,65,15,104,201 ; punpckhbw %xmm9,%xmm1
+ DB 102,15,104,202 ; punpckhbw %xmm2,%xmm1
+ DB 102,69,15,106,193 ; punpckhdq %xmm9,%xmm8
DB 102,15,239,210 ; pxor %xmm2,%xmm2
DB 102,65,15,96,208 ; punpcklbw %xmm8,%xmm2
DB 102,65,15,104,216 ; punpckhbw %xmm8,%xmm3
- DB 102,68,15,111,5,234,1,0,0 ; movdqa 0x1ea(%rip),%xmm8 # 390 <_sk_swap_rb_ssse3_lowp+0x18>
+ DB 102,68,15,111,5,1,2,0,0 ; movdqa 0x201(%rip),%xmm8 # 3a0 <_sk_swap_rb_ssse3_lowp+0x24>
DB 102,65,15,228,192 ; pmulhuw %xmm8,%xmm0
DB 102,65,15,228,200 ; pmulhuw %xmm8,%xmm1
DB 102,65,15,228,208 ; pmulhuw %xmm8,%xmm2
@@ -26392,53 +26389,52 @@ _sk_load_8888_ssse3_lowp LABEL PROC
DB 255,224 ; jmpq *%rax
DB 69,137,193 ; mov %r8d,%r9d
DB 65,128,225,7 ; and $0x7,%r9b
- DB 102,15,239,201 ; pxor %xmm1,%xmm1
- DB 102,15,239,192 ; pxor %xmm0,%xmm0
+ DB 102,69,15,239,201 ; pxor %xmm9,%xmm9
+ DB 69,15,87,192 ; xorps %xmm8,%xmm8
DB 65,254,201 ; dec %r9b
DB 65,128,249,6 ; cmp $0x6,%r9b
- DB 15,135,119,255,255,255 ; ja 151 <_sk_load_8888_ssse3_lowp+0x15>
+ DB 119,129 ; ja 151 <_sk_load_8888_ssse3_lowp+0x15>
DB 69,15,182,201 ; movzbl %r9b,%r9d
- DB 76,141,21,119,0,0,0 ; lea 0x77(%rip),%r10 # 25c <_sk_load_8888_ssse3_lowp+0x120>
+ DB 76,141,21,133,0,0,0 ; lea 0x85(%rip),%r10 # 260 <_sk_load_8888_ssse3_lowp+0x124>
DB 75,99,4,138 ; movslq (%r10,%r9,4),%rax
DB 76,1,208 ; add %r10,%rax
DB 255,224 ; jmpq *%rax
DB 102,65,15,110,68,147,24 ; movd 0x18(%r11,%rdx,4),%xmm0
- DB 102,15,112,200,69 ; pshufd $0x45,%xmm0,%xmm1
- DB 102,15,239,192 ; pxor %xmm0,%xmm0
- DB 243,65,15,16,84,147,20 ; movss 0x14(%r11,%rdx,4),%xmm2
- DB 15,198,209,0 ; shufps $0x0,%xmm1,%xmm2
- DB 15,198,209,226 ; shufps $0xe2,%xmm1,%xmm2
- DB 15,40,202 ; movaps %xmm2,%xmm1
- DB 243,65,15,16,84,147,16 ; movss 0x10(%r11,%rdx,4),%xmm2
- DB 243,15,16,202 ; movss %xmm2,%xmm1
- DB 243,65,15,16,84,147,12 ; movss 0xc(%r11,%rdx,4),%xmm2
- DB 15,198,208,32 ; shufps $0x20,%xmm0,%xmm2
- DB 15,198,194,36 ; shufps $0x24,%xmm2,%xmm0
- DB 243,65,15,16,84,147,8 ; movss 0x8(%r11,%rdx,4),%xmm2
- DB 15,198,208,48 ; shufps $0x30,%xmm0,%xmm2
- DB 15,198,194,132 ; shufps $0x84,%xmm2,%xmm0
- DB 243,65,15,16,84,147,4 ; movss 0x4(%r11,%rdx,4),%xmm2
- DB 15,198,208,0 ; shufps $0x0,%xmm0,%xmm2
- DB 15,198,208,226 ; shufps $0xe2,%xmm0,%xmm2
- DB 15,40,194 ; movaps %xmm2,%xmm0
- DB 243,65,15,16,20,147 ; movss (%r11,%rdx,4),%xmm2
- DB 243,15,16,194 ; movss %xmm2,%xmm0
- DB 233,247,254,255,255 ; jmpq 151 <_sk_load_8888_ssse3_lowp+0x15>
- DB 102,144 ; xchg %ax,%ax
- DB 239 ; out %eax,(%dx)
- DB 255 ; (bad)
+ DB 102,68,15,112,200,69 ; pshufd $0x45,%xmm0,%xmm9
+ DB 69,15,87,192 ; xorps %xmm8,%xmm8
+ DB 243,65,15,16,68,147,20 ; movss 0x14(%r11,%rdx,4),%xmm0
+ DB 65,15,198,193,0 ; shufps $0x0,%xmm9,%xmm0
+ DB 65,15,198,193,226 ; shufps $0xe2,%xmm9,%xmm0
+ DB 68,15,40,200 ; movaps %xmm0,%xmm9
+ DB 243,65,15,16,68,147,16 ; movss 0x10(%r11,%rdx,4),%xmm0
+ DB 243,68,15,16,200 ; movss %xmm0,%xmm9
+ DB 243,65,15,16,68,147,12 ; movss 0xc(%r11,%rdx,4),%xmm0
+ DB 65,15,198,192,32 ; shufps $0x20,%xmm8,%xmm0
+ DB 68,15,198,192,36 ; shufps $0x24,%xmm0,%xmm8
+ DB 243,65,15,16,68,147,8 ; movss 0x8(%r11,%rdx,4),%xmm0
+ DB 65,15,198,192,48 ; shufps $0x30,%xmm8,%xmm0
+ DB 68,15,198,192,132 ; shufps $0x84,%xmm0,%xmm8
+ DB 243,65,15,16,68,147,4 ; movss 0x4(%r11,%rdx,4),%xmm0
+ DB 65,15,198,192,0 ; shufps $0x0,%xmm8,%xmm0
+ DB 65,15,198,192,226 ; shufps $0xe2,%xmm8,%xmm0
+ DB 68,15,40,192 ; movaps %xmm0,%xmm8
+ DB 243,65,15,16,4,147 ; movss (%r11,%rdx,4),%xmm0
+ DB 243,68,15,16,192 ; movss %xmm0,%xmm8
+ DB 233,244,254,255,255 ; jmpq 151 <_sk_load_8888_ssse3_lowp+0x15>
+ DB 15,31,0 ; nopl (%rax)
+ DB 237 ; in (%dx),%eax
DB 255 ; (bad)
DB 255 ; (bad)
- DB 221,255 ; (bad)
DB 255 ; (bad)
- DB 255,206 ; dec %esi
+ DB 216,255 ; fdivr %st(7),%st
DB 255 ; (bad)
+ DB 255,199 ; inc %edi
DB 255 ; (bad)
DB 255 ; (bad)
- DB 191,255,255,255,180 ; mov $0xb4ffffff,%edi
+ DB 255,182,255,255,255,170 ; pushq -0x55000001(%rsi)
DB 255 ; (bad)
DB 255 ; (bad)
- DB 255,162,255,255,255,146 ; jmpq *-0x6d000001(%rdx)
+ DB 255,149,255,255,255,132 ; callq *-0x7b000001(%rbp)
DB 255 ; (bad)
DB 255 ; (bad)
DB 255 ; .byte 0xff
@@ -26464,7 +26460,7 @@ _sk_store_8888_ssse3_lowp LABEL PROC
DB 102,69,15,97,194 ; punpcklwd %xmm10,%xmm8
DB 102,69,15,105,202 ; punpckhwd %xmm10,%xmm9
DB 77,133,192 ; test %r8,%r8
- DB 117,17 ; jne 2e7 <_sk_store_8888_ssse3_lowp+0x6f>
+ DB 117,17 ; jne 2eb <_sk_store_8888_ssse3_lowp+0x6f>
DB 243,69,15,127,76,147,16 ; movdqu %xmm9,0x10(%r11,%rdx,4)
DB 243,69,15,127,4,147 ; movdqu %xmm8,(%r11,%rdx,4)
DB 72,173 ; lods %ds:(%rsi),%rax
@@ -26473,9 +26469,9 @@ _sk_store_8888_ssse3_lowp LABEL PROC
DB 65,128,225,7 ; and $0x7,%r9b
DB 65,254,201 ; dec %r9b
DB 65,128,249,6 ; cmp $0x6,%r9b
- DB 119,236 ; ja 2e3 <_sk_store_8888_ssse3_lowp+0x6b>
+ DB 119,236 ; ja 2e7 <_sk_store_8888_ssse3_lowp+0x6b>
DB 69,15,182,201 ; movzbl %r9b,%r9d
- DB 76,141,21,90,0,0,0 ; lea 0x5a(%rip),%r10 # 35c <_sk_store_8888_ssse3_lowp+0xe4>
+ DB 76,141,21,90,0,0,0 ; lea 0x5a(%rip),%r10 # 360 <_sk_store_8888_ssse3_lowp+0xe4>
DB 75,99,4,138 ; movslq (%r10,%r9,4),%rax
DB 76,1,208 ; add %r10,%rax
DB 255,224 ; jmpq *%rax
@@ -26491,7 +26487,7 @@ _sk_store_8888_ssse3_lowp LABEL PROC
DB 102,69,15,112,200,229 ; pshufd $0xe5,%xmm8,%xmm9
DB 102,69,15,126,76,147,4 ; movd %xmm9,0x4(%r11,%rdx,4)
DB 102,69,15,126,4,147 ; movd %xmm8,(%r11,%rdx,4)
- DB 235,136 ; jmp 2e3 <_sk_store_8888_ssse3_lowp+0x6b>
+ DB 235,136 ; jmp 2e7 <_sk_store_8888_ssse3_lowp+0x6b>
DB 144 ; nop
DB 247,255 ; idiv %edi
DB 255 ; (bad)
@@ -26523,6 +26519,12 @@ _sk_swap_rb_ssse3_lowp LABEL PROC
DB 255,224 ; jmpq *%rax
ALIGN 16
+ DB 0,4,8 ; add %al,(%rax,%rcx,1)
+ DB 12,1 ; or $0x1,%al
+ DB 5,9,13,2,6 ; add $0x6020d09,%eax
+ DB 10,14 ; or (%rsi),%cl
+ DB 3,7 ; add (%rdi),%eax
+ DB 11,15 ; or (%rdi),%ecx
DB 129,128,129,128,129,128,129,128,129,128; addl $0x80818081,-0x7f7e7f7f(%rax)
DB 129 ; .byte 0x81
DB 128 ; .byte 0x80
diff --git a/src/jumper/SkJumper_stages_lowp.cpp b/src/jumper/SkJumper_stages_lowp.cpp
index 81520f16ab..4d0df94af6 100644
--- a/src/jumper/SkJumper_stages_lowp.cpp
+++ b/src/jumper/SkJumper_stages_lowp.cpp
@@ -121,14 +121,13 @@ SI void from_8888(U32 rgba, F* r, F* g, F* b, F* a) {
U16 lo = unaligned_load<U16>((const uint32_t*)&rgba + 0),
hi = unaligned_load<U16>((const uint32_t*)&rgba + 4);
- U16 _0415 = _mm_unpacklo_epi8(lo, hi), // r0 r4 g0 g4 b0 b4 a0 a4 r1 r5 g1 g5 b1 b5 a1 a5
- _2637 = _mm_unpackhi_epi8(lo, hi);
+ // Shuffle so that the 4 bytes of each color channel are contiguous...
+ lo = _mm_shuffle_epi8(lo, _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15));
+ hi = _mm_shuffle_epi8(hi, _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15));
- U16 even = _mm_unpacklo_epi8(_0415, _2637), // r0 r2 r4 r6 g0 g2 g4 g6 b0 b2 b4 b6 a0 a2 a4 a6
- odd = _mm_unpackhi_epi8(_0415, _2637);
-
- U16 rg = _mm_unpacklo_epi8(even, odd), // r0 r1 r2 r3 r4 r5 r6 r7 g0 g1 g2 g3 g4 g5 g6 g7
- ba = _mm_unpackhi_epi8(even, odd);
+ // ...then get all 8 bytes of each color channel together into a single register.
+ U16 rg = _mm_unpacklo_epi32(lo,hi),
+ ba = _mm_unpackhi_epi32(lo,hi);
// Unpack as 16-bit values into the high half of each 16-bit lane, to get a free *256.
U16 R = _mm_unpacklo_epi8(U16(0), rg),