| | | |
|---|---|---|
| author | Mike Klein <mtklein@chromium.org> | 2017-04-04 10:24:56 -0400 |
| committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2017-04-04 17:29:38 +0000 |
| commit | 95f53be0059940da50d4fce10da5c4dcf037b6ae (patch) | |
| tree | 9ae1fcc979936cf72f4f9757cbd48fdb84dbfbae /src | |
| parent | 744808823f635c863d7ca6b4eba652115c92ff85 (diff) | |
jumper, split store_f16 into to_half, store4
Pretty much the same deal as the last CL going the other direction:
split store_f16 into to_half() and store4(). Platforms that had fused
strategies here get a little less optimal, but the code's easier to
follow, maintain, and reuse.
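
For reference, here is the new store_f16 body from the SkJumper_stages.cpp hunk below; after the split, the stage is just a half-float conversion feeding a platform-specific interleaved store:

```cpp
STAGE(store_f16) {
    auto ptr = *(uint64_t**)ctx + x;

    store4(ptr,tail, to_half(r)
                   , to_half(g)
                   , to_half(b)
                   , to_half(a));
}
```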
Also adds widen_cast() to encapsulate the fairly common pattern of
expanding one of our logical vector types (e.g. 8-byte U16) up to the
width of the physical vector type (e.g. 16-byte __m128i). This
operation is deeply understood by Clang and is often a no-op.
I could make bit_cast() do this, but it seems clearer to have two names.
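
Concretely, widen_cast() as added to SkJumper_misc.h in this CL is a size-checked memcpy into the low bytes of the wider type; the usage line is taken from the SkJumper_vectors.h hunk:

```cpp
template <typename Dst, typename Src>
SI Dst widen_cast(const Src& src) {
    static_assert(sizeof(Dst) > sizeof(Src), "");
    Dst dst;                          // The upper bytes are left undefined;
    memcpy(&dst, &src, sizeof(Src));  // the low bytes hold a copy of src.
    return dst;
}

// e.g. expand an 8-byte U16 to a 16-byte __m128i:
auto v = widen_cast<__m128i>(h);
```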
Change-Id: I7ba5bb4746acfcaa6d486379f67e07baee3820b2
Reviewed-on: https://skia-review.googlesource.com/11204
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src')

| | | |
|---|---|---|
| -rw-r--r-- | src/jumper/SkJumper_generated.S | 168 |
| -rw-r--r-- | src/jumper/SkJumper_generated_win.S | 146 |
| -rw-r--r-- | src/jumper/SkJumper_misc.h | 8 |
| -rw-r--r-- | src/jumper/SkJumper_stages.cpp | 115 |
| -rw-r--r-- | src/jumper/SkJumper_vectors.h | 90 |

5 files changed, 262 insertions, 265 deletions
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S index bf724d28f9..db82770f4c 100644 --- a/src/jumper/SkJumper_generated.S +++ b/src/jumper/SkJumper_generated.S @@ -3139,18 +3139,19 @@ _sk_load_f16_vfp4: HIDDEN _sk_store_f16_vfp4 .globl _sk_store_f16_vfp4 _sk_store_f16_vfp4: - .long 0xeef00b41 // vmov.f64 d16, d1 - .long 0xeef03b42 // vmov.f64 d19, d2 - .long 0xf2631113 // vorr d17, d3, d3 - .long 0xf2602110 // vorr d18, d0, d0 - .long 0xf3fa00a1 // vtrn.32 d16, d17 - .long 0xf3f61620 // vcvt.f16.f32 d17, q8 - .long 0xf3fa20a3 // vtrn.32 d18, d19 + .long 0xf2630113 // vorr d16, d3, d3 .long 0xe5913000 // ldr r3, [r1] - .long 0xf3f60622 // vcvt.f16.f32 d16, q9 + .long 0xf2612111 // vorr d18, d1, d1 + .long 0xf3f67620 // vcvt.f16.f32 d23, q8 .long 0xe5933000 // ldr r3, [r3] + .long 0xf3f66602 // vcvt.f16.f32 d22, q1 .long 0xe0833180 // add r3, r3, r0, lsl #3 - .long 0xf443084f // vst2.16 {d16-d17}, [r3] + .long 0xf3f65622 // vcvt.f16.f32 d21, q9 + .long 0xf3f64600 // vcvt.f16.f32 d20, q0 + .long 0xf22211b2 // vorr d1, d18, d18 + .long 0xf22031b0 // vorr d3, d16, d16 + .long 0xf4c3470d // vst4.16 {d20[0],d21[0],d22[0],d23[0]}, [r3]! + .long 0xf4c3474f // vst4.16 {d20[1],d21[1],d22[1],d23[1]}, [r3] .long 0xe2813008 // add r3, r1, #8 .long 0xe591c004 // ldr ip, [r1, #4] .long 0xe1a01003 // mov r1, r3 @@ -3193,7 +3194,6 @@ _sk_clamp_y_vfp4: .long 0xf26218a1 // vadd.i32 d17, d18, d17 .long 0xf2201fa1 // vmin.f32 d1, d16, d17 .long 0xe12fff1c // bx ip - .long 0xe320f000 // nop {0} HIDDEN _sk_repeat_x_vfp4 .globl _sk_repeat_x_vfp4 @@ -6907,7 +6907,7 @@ _sk_lerp_565_avx: .byte 255 // (bad) .byte 255 // (bad) .byte 255 // (bad) - .byte 233,255,255,255,225 // jmpq ffffffffe2001208 <_sk_linear_gradient_2stops_avx+0xffffffffe1ffeb0f> + .byte 233,255,255,255,225 // jmpq ffffffffe2001208 <_sk_linear_gradient_2stops_avx+0xffffffffe1ffeb13> .byte 255 // (bad) .byte 255 // (bad) .byte 255 // (bad) @@ -7876,32 +7876,32 @@ _sk_store_f16_avx: .byte 196,67,125,25,202,1 // vextractf128 $0x1,%ymm9,%xmm10 .byte 196,193,41,114,210,13 // vpsrld $0xd,%xmm10,%xmm10 .byte 196,193,49,114,209,13 // vpsrld $0xd,%xmm9,%xmm9 - .byte 197,60,89,217 // vmulps %ymm1,%ymm8,%ymm11 + .byte 196,66,49,43,202 // vpackusdw %xmm10,%xmm9,%xmm9 + .byte 197,60,89,209 // vmulps %ymm1,%ymm8,%ymm10 + .byte 196,67,125,25,211,1 // vextractf128 $0x1,%ymm10,%xmm11 + .byte 196,193,33,114,211,13 // vpsrld $0xd,%xmm11,%xmm11 + .byte 196,193,41,114,210,13 // vpsrld $0xd,%xmm10,%xmm10 + .byte 196,66,41,43,211 // vpackusdw %xmm11,%xmm10,%xmm10 + .byte 197,60,89,218 // vmulps %ymm2,%ymm8,%ymm11 .byte 196,67,125,25,220,1 // vextractf128 $0x1,%ymm11,%xmm12 .byte 196,193,25,114,212,13 // vpsrld $0xd,%xmm12,%xmm12 .byte 196,193,33,114,211,13 // vpsrld $0xd,%xmm11,%xmm11 - .byte 197,60,89,234 // vmulps %ymm2,%ymm8,%ymm13 - .byte 196,67,125,25,238,1 // vextractf128 $0x1,%ymm13,%xmm14 - .byte 196,193,9,114,214,13 // vpsrld $0xd,%xmm14,%xmm14 - .byte 196,193,17,114,213,13 // vpsrld $0xd,%xmm13,%xmm13 + .byte 196,66,33,43,220 // vpackusdw %xmm12,%xmm11,%xmm11 .byte 197,60,89,195 // vmulps %ymm3,%ymm8,%ymm8 - .byte 196,67,125,25,199,1 // vextractf128 $0x1,%ymm8,%xmm15 - .byte 196,193,1,114,215,13 // vpsrld $0xd,%xmm15,%xmm15 + .byte 196,67,125,25,196,1 // vextractf128 $0x1,%ymm8,%xmm12 + .byte 196,193,25,114,212,13 // vpsrld $0xd,%xmm12,%xmm12 .byte 196,193,57,114,208,13 // vpsrld $0xd,%xmm8,%xmm8 - .byte 196,193,33,115,251,2 // vpslldq $0x2,%xmm11,%xmm11 - .byte 196,65,33,235,201 // vpor %xmm9,%xmm11,%xmm9 - .byte 
196,193,33,115,252,2 // vpslldq $0x2,%xmm12,%xmm11 - .byte 196,65,33,235,226 // vpor %xmm10,%xmm11,%xmm12 - .byte 196,193,57,115,248,2 // vpslldq $0x2,%xmm8,%xmm8 - .byte 196,65,57,235,197 // vpor %xmm13,%xmm8,%xmm8 - .byte 196,193,41,115,255,2 // vpslldq $0x2,%xmm15,%xmm10 - .byte 196,65,41,235,238 // vpor %xmm14,%xmm10,%xmm13 - .byte 196,65,49,98,216 // vpunpckldq %xmm8,%xmm9,%xmm11 - .byte 196,65,49,106,208 // vpunpckhdq %xmm8,%xmm9,%xmm10 - .byte 196,65,25,98,205 // vpunpckldq %xmm13,%xmm12,%xmm9 - .byte 196,65,25,106,197 // vpunpckhdq %xmm13,%xmm12,%xmm8 + .byte 196,66,57,43,196 // vpackusdw %xmm12,%xmm8,%xmm8 + .byte 196,65,49,97,226 // vpunpcklwd %xmm10,%xmm9,%xmm12 + .byte 196,65,49,105,234 // vpunpckhwd %xmm10,%xmm9,%xmm13 + .byte 196,65,33,97,200 // vpunpcklwd %xmm8,%xmm11,%xmm9 + .byte 196,65,33,105,192 // vpunpckhwd %xmm8,%xmm11,%xmm8 + .byte 196,65,25,98,217 // vpunpckldq %xmm9,%xmm12,%xmm11 + .byte 196,65,25,106,209 // vpunpckhdq %xmm9,%xmm12,%xmm10 + .byte 196,65,17,98,200 // vpunpckldq %xmm8,%xmm13,%xmm9 + .byte 196,65,17,106,192 // vpunpckhdq %xmm8,%xmm13,%xmm8 .byte 72,133,201 // test %rcx,%rcx - .byte 117,31 // jne 2142 <_sk_store_f16_avx+0xd6> + .byte 117,31 // jne 213e <_sk_store_f16_avx+0xd2> .byte 196,65,120,17,28,248 // vmovups %xmm11,(%r8,%rdi,8) .byte 196,65,120,17,84,248,16 // vmovups %xmm10,0x10(%r8,%rdi,8) .byte 196,65,120,17,76,248,32 // vmovups %xmm9,0x20(%r8,%rdi,8) @@ -7910,22 +7910,22 @@ _sk_store_f16_avx: .byte 255,224 // jmpq *%rax .byte 196,65,121,214,28,248 // vmovq %xmm11,(%r8,%rdi,8) .byte 72,131,249,1 // cmp $0x1,%rcx - .byte 116,240 // je 213e <_sk_store_f16_avx+0xd2> + .byte 116,240 // je 213a <_sk_store_f16_avx+0xce> .byte 196,65,121,23,92,248,8 // vmovhpd %xmm11,0x8(%r8,%rdi,8) .byte 72,131,249,3 // cmp $0x3,%rcx - .byte 114,227 // jb 213e <_sk_store_f16_avx+0xd2> + .byte 114,227 // jb 213a <_sk_store_f16_avx+0xce> .byte 196,65,121,214,84,248,16 // vmovq %xmm10,0x10(%r8,%rdi,8) - .byte 116,218 // je 213e <_sk_store_f16_avx+0xd2> + .byte 116,218 // je 213a <_sk_store_f16_avx+0xce> .byte 196,65,121,23,84,248,24 // vmovhpd %xmm10,0x18(%r8,%rdi,8) .byte 72,131,249,5 // cmp $0x5,%rcx - .byte 114,205 // jb 213e <_sk_store_f16_avx+0xd2> + .byte 114,205 // jb 213a <_sk_store_f16_avx+0xce> .byte 196,65,121,214,76,248,32 // vmovq %xmm9,0x20(%r8,%rdi,8) - .byte 116,196 // je 213e <_sk_store_f16_avx+0xd2> + .byte 116,196 // je 213a <_sk_store_f16_avx+0xce> .byte 196,65,121,23,76,248,40 // vmovhpd %xmm9,0x28(%r8,%rdi,8) .byte 72,131,249,7 // cmp $0x7,%rcx - .byte 114,183 // jb 213e <_sk_store_f16_avx+0xd2> + .byte 114,183 // jb 213a <_sk_store_f16_avx+0xce> .byte 196,65,121,214,68,248,48 // vmovq %xmm8,0x30(%r8,%rdi,8) - .byte 235,174 // jmp 213e <_sk_store_f16_avx+0xd2> + .byte 235,174 // jmp 213a <_sk_store_f16_avx+0xce> HIDDEN _sk_store_f32_avx .globl _sk_store_f32_avx @@ -7942,7 +7942,7 @@ _sk_store_f32_avx: .byte 196,65,37,20,196 // vunpcklpd %ymm12,%ymm11,%ymm8 .byte 196,65,37,21,220 // vunpckhpd %ymm12,%ymm11,%ymm11 .byte 72,133,201 // test %rcx,%rcx - .byte 117,55 // jne 21fd <_sk_store_f32_avx+0x6d> + .byte 117,55 // jne 21f9 <_sk_store_f32_avx+0x6d> .byte 196,67,45,24,225,1 // vinsertf128 $0x1,%xmm9,%ymm10,%ymm12 .byte 196,67,61,24,235,1 // vinsertf128 $0x1,%xmm11,%ymm8,%ymm13 .byte 196,67,45,6,201,49 // vperm2f128 $0x31,%ymm9,%ymm10,%ymm9 @@ -7955,22 +7955,22 @@ _sk_store_f32_avx: .byte 255,224 // jmpq *%rax .byte 196,65,121,17,20,128 // vmovupd %xmm10,(%r8,%rax,4) .byte 72,131,249,1 // cmp $0x1,%rcx - .byte 116,240 // je 21f9 
<_sk_store_f32_avx+0x69> + .byte 116,240 // je 21f5 <_sk_store_f32_avx+0x69> .byte 196,65,121,17,76,128,16 // vmovupd %xmm9,0x10(%r8,%rax,4) .byte 72,131,249,3 // cmp $0x3,%rcx - .byte 114,227 // jb 21f9 <_sk_store_f32_avx+0x69> + .byte 114,227 // jb 21f5 <_sk_store_f32_avx+0x69> .byte 196,65,121,17,68,128,32 // vmovupd %xmm8,0x20(%r8,%rax,4) - .byte 116,218 // je 21f9 <_sk_store_f32_avx+0x69> + .byte 116,218 // je 21f5 <_sk_store_f32_avx+0x69> .byte 196,65,121,17,92,128,48 // vmovupd %xmm11,0x30(%r8,%rax,4) .byte 72,131,249,5 // cmp $0x5,%rcx - .byte 114,205 // jb 21f9 <_sk_store_f32_avx+0x69> + .byte 114,205 // jb 21f5 <_sk_store_f32_avx+0x69> .byte 196,67,125,25,84,128,64,1 // vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4) - .byte 116,195 // je 21f9 <_sk_store_f32_avx+0x69> + .byte 116,195 // je 21f5 <_sk_store_f32_avx+0x69> .byte 196,67,125,25,76,128,80,1 // vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4) .byte 72,131,249,7 // cmp $0x7,%rcx - .byte 114,181 // jb 21f9 <_sk_store_f32_avx+0x69> + .byte 114,181 // jb 21f5 <_sk_store_f32_avx+0x69> .byte 196,67,125,25,68,128,96,1 // vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4) - .byte 235,171 // jmp 21f9 <_sk_store_f32_avx+0x69> + .byte 235,171 // jmp 21f5 <_sk_store_f32_avx+0x69> HIDDEN _sk_clamp_x_avx .globl _sk_clamp_x_avx @@ -10038,27 +10038,29 @@ _sk_store_f16_sse41: .byte 72,139,0 // mov (%rax),%rax .byte 185,0,0,128,7 // mov $0x7800000,%ecx .byte 102,68,15,110,193 // movd %ecx,%xmm8 - .byte 102,69,15,112,192,0 // pshufd $0x0,%xmm8,%xmm8 - .byte 102,69,15,111,200 // movdqa %xmm8,%xmm9 - .byte 68,15,89,200 // mulps %xmm0,%xmm9 - .byte 102,65,15,114,209,13 // psrld $0xd,%xmm9 - .byte 102,69,15,111,208 // movdqa %xmm8,%xmm10 + .byte 102,69,15,112,200,0 // pshufd $0x0,%xmm8,%xmm9 + .byte 102,69,15,111,193 // movdqa %xmm9,%xmm8 + .byte 68,15,89,192 // mulps %xmm0,%xmm8 + .byte 102,65,15,114,208,13 // psrld $0xd,%xmm8 + .byte 102,69,15,56,43,192 // packusdw %xmm8,%xmm8 + .byte 102,69,15,111,209 // movdqa %xmm9,%xmm10 .byte 68,15,89,209 // mulps %xmm1,%xmm10 .byte 102,65,15,114,210,13 // psrld $0xd,%xmm10 - .byte 102,69,15,111,216 // movdqa %xmm8,%xmm11 + .byte 102,69,15,56,43,210 // packusdw %xmm10,%xmm10 + .byte 102,69,15,111,217 // movdqa %xmm9,%xmm11 .byte 68,15,89,218 // mulps %xmm2,%xmm11 .byte 102,65,15,114,211,13 // psrld $0xd,%xmm11 - .byte 68,15,89,195 // mulps %xmm3,%xmm8 - .byte 102,65,15,114,208,13 // psrld $0xd,%xmm8 - .byte 102,65,15,115,250,2 // pslldq $0x2,%xmm10 - .byte 102,69,15,235,209 // por %xmm9,%xmm10 - .byte 102,65,15,115,248,2 // pslldq $0x2,%xmm8 - .byte 102,69,15,235,195 // por %xmm11,%xmm8 - .byte 102,69,15,111,202 // movdqa %xmm10,%xmm9 - .byte 102,69,15,98,200 // punpckldq %xmm8,%xmm9 + .byte 102,69,15,56,43,219 // packusdw %xmm11,%xmm11 + .byte 68,15,89,203 // mulps %xmm3,%xmm9 + .byte 102,65,15,114,209,13 // psrld $0xd,%xmm9 + .byte 102,69,15,56,43,201 // packusdw %xmm9,%xmm9 + .byte 102,69,15,97,194 // punpcklwd %xmm10,%xmm8 + .byte 102,69,15,97,217 // punpcklwd %xmm9,%xmm11 + .byte 102,69,15,111,200 // movdqa %xmm8,%xmm9 + .byte 102,69,15,98,203 // punpckldq %xmm11,%xmm9 .byte 243,68,15,127,12,248 // movdqu %xmm9,(%rax,%rdi,8) - .byte 102,69,15,106,208 // punpckhdq %xmm8,%xmm10 - .byte 243,68,15,127,84,248,16 // movdqu %xmm10,0x10(%rax,%rdi,8) + .byte 102,69,15,106,195 // punpckhdq %xmm11,%xmm8 + .byte 243,68,15,127,68,248,16 // movdqu %xmm8,0x10(%rax,%rdi,8) .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax @@ -12263,27 +12265,37 @@ _sk_store_f16_sse2: .byte 72,139,0 // mov (%rax),%rax .byte 185,0,0,128,7 // 
mov $0x7800000,%ecx .byte 102,68,15,110,193 // movd %ecx,%xmm8 - .byte 102,69,15,112,192,0 // pshufd $0x0,%xmm8,%xmm8 - .byte 102,69,15,111,200 // movdqa %xmm8,%xmm9 - .byte 68,15,89,200 // mulps %xmm0,%xmm9 - .byte 102,65,15,114,209,13 // psrld $0xd,%xmm9 - .byte 102,69,15,111,208 // movdqa %xmm8,%xmm10 + .byte 102,69,15,112,200,0 // pshufd $0x0,%xmm8,%xmm9 + .byte 102,69,15,111,193 // movdqa %xmm9,%xmm8 + .byte 68,15,89,192 // mulps %xmm0,%xmm8 + .byte 102,65,15,114,208,13 // psrld $0xd,%xmm8 + .byte 102,65,15,114,240,16 // pslld $0x10,%xmm8 + .byte 102,65,15,114,224,16 // psrad $0x10,%xmm8 + .byte 102,69,15,107,192 // packssdw %xmm8,%xmm8 + .byte 102,69,15,111,209 // movdqa %xmm9,%xmm10 .byte 68,15,89,209 // mulps %xmm1,%xmm10 .byte 102,65,15,114,210,13 // psrld $0xd,%xmm10 - .byte 102,69,15,111,216 // movdqa %xmm8,%xmm11 + .byte 102,65,15,114,242,16 // pslld $0x10,%xmm10 + .byte 102,65,15,114,226,16 // psrad $0x10,%xmm10 + .byte 102,69,15,107,210 // packssdw %xmm10,%xmm10 + .byte 102,69,15,111,217 // movdqa %xmm9,%xmm11 .byte 68,15,89,218 // mulps %xmm2,%xmm11 .byte 102,65,15,114,211,13 // psrld $0xd,%xmm11 - .byte 68,15,89,195 // mulps %xmm3,%xmm8 - .byte 102,65,15,114,208,13 // psrld $0xd,%xmm8 - .byte 102,65,15,115,250,2 // pslldq $0x2,%xmm10 - .byte 102,69,15,235,209 // por %xmm9,%xmm10 - .byte 102,65,15,115,248,2 // pslldq $0x2,%xmm8 - .byte 102,69,15,235,195 // por %xmm11,%xmm8 - .byte 102,69,15,111,202 // movdqa %xmm10,%xmm9 - .byte 102,69,15,98,200 // punpckldq %xmm8,%xmm9 + .byte 102,65,15,114,243,16 // pslld $0x10,%xmm11 + .byte 102,65,15,114,227,16 // psrad $0x10,%xmm11 + .byte 102,69,15,107,219 // packssdw %xmm11,%xmm11 + .byte 68,15,89,203 // mulps %xmm3,%xmm9 + .byte 102,65,15,114,209,13 // psrld $0xd,%xmm9 + .byte 102,65,15,114,241,16 // pslld $0x10,%xmm9 + .byte 102,65,15,114,225,16 // psrad $0x10,%xmm9 + .byte 102,69,15,107,201 // packssdw %xmm9,%xmm9 + .byte 102,69,15,97,194 // punpcklwd %xmm10,%xmm8 + .byte 102,69,15,97,217 // punpcklwd %xmm9,%xmm11 + .byte 102,69,15,111,200 // movdqa %xmm8,%xmm9 + .byte 102,69,15,98,203 // punpckldq %xmm11,%xmm9 .byte 243,68,15,127,12,248 // movdqu %xmm9,(%rax,%rdi,8) - .byte 102,69,15,106,208 // punpckhdq %xmm8,%xmm10 - .byte 243,68,15,127,84,248,16 // movdqu %xmm10,0x10(%rax,%rdi,8) + .byte 102,69,15,106,195 // punpckhdq %xmm11,%xmm8 + .byte 243,68,15,127,68,248,16 // movdqu %xmm8,0x10(%rax,%rdi,8) .byte 72,173 // lods %ds:(%rsi),%rax .byte 255,224 // jmpq *%rax diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S index a25db7c396..a662394171 100644 --- a/src/jumper/SkJumper_generated_win.S +++ b/src/jumper/SkJumper_generated_win.S @@ -4286,32 +4286,32 @@ _sk_store_f16_avx LABEL PROC DB 196,67,125,25,202,1 ; vextractf128 $0x1,%ymm9,%xmm10 DB 196,193,41,114,210,13 ; vpsrld $0xd,%xmm10,%xmm10 DB 196,193,49,114,209,13 ; vpsrld $0xd,%xmm9,%xmm9 - DB 197,60,89,217 ; vmulps %ymm1,%ymm8,%ymm11 + DB 196,66,49,43,202 ; vpackusdw %xmm10,%xmm9,%xmm9 + DB 197,60,89,209 ; vmulps %ymm1,%ymm8,%ymm10 + DB 196,67,125,25,211,1 ; vextractf128 $0x1,%ymm10,%xmm11 + DB 196,193,33,114,211,13 ; vpsrld $0xd,%xmm11,%xmm11 + DB 196,193,41,114,210,13 ; vpsrld $0xd,%xmm10,%xmm10 + DB 196,66,41,43,211 ; vpackusdw %xmm11,%xmm10,%xmm10 + DB 197,60,89,218 ; vmulps %ymm2,%ymm8,%ymm11 DB 196,67,125,25,220,1 ; vextractf128 $0x1,%ymm11,%xmm12 DB 196,193,25,114,212,13 ; vpsrld $0xd,%xmm12,%xmm12 DB 196,193,33,114,211,13 ; vpsrld $0xd,%xmm11,%xmm11 - DB 197,60,89,234 ; vmulps %ymm2,%ymm8,%ymm13 - DB 196,67,125,25,238,1 ; vextractf128 
$0x1,%ymm13,%xmm14 - DB 196,193,9,114,214,13 ; vpsrld $0xd,%xmm14,%xmm14 - DB 196,193,17,114,213,13 ; vpsrld $0xd,%xmm13,%xmm13 + DB 196,66,33,43,220 ; vpackusdw %xmm12,%xmm11,%xmm11 DB 197,60,89,195 ; vmulps %ymm3,%ymm8,%ymm8 - DB 196,67,125,25,199,1 ; vextractf128 $0x1,%ymm8,%xmm15 - DB 196,193,1,114,215,13 ; vpsrld $0xd,%xmm15,%xmm15 + DB 196,67,125,25,196,1 ; vextractf128 $0x1,%ymm8,%xmm12 + DB 196,193,25,114,212,13 ; vpsrld $0xd,%xmm12,%xmm12 DB 196,193,57,114,208,13 ; vpsrld $0xd,%xmm8,%xmm8 - DB 196,193,33,115,251,2 ; vpslldq $0x2,%xmm11,%xmm11 - DB 196,65,33,235,201 ; vpor %xmm9,%xmm11,%xmm9 - DB 196,193,33,115,252,2 ; vpslldq $0x2,%xmm12,%xmm11 - DB 196,65,33,235,226 ; vpor %xmm10,%xmm11,%xmm12 - DB 196,193,57,115,248,2 ; vpslldq $0x2,%xmm8,%xmm8 - DB 196,65,57,235,197 ; vpor %xmm13,%xmm8,%xmm8 - DB 196,193,41,115,255,2 ; vpslldq $0x2,%xmm15,%xmm10 - DB 196,65,41,235,238 ; vpor %xmm14,%xmm10,%xmm13 - DB 196,65,49,98,216 ; vpunpckldq %xmm8,%xmm9,%xmm11 - DB 196,65,49,106,208 ; vpunpckhdq %xmm8,%xmm9,%xmm10 - DB 196,65,25,98,205 ; vpunpckldq %xmm13,%xmm12,%xmm9 - DB 196,65,25,106,197 ; vpunpckhdq %xmm13,%xmm12,%xmm8 + DB 196,66,57,43,196 ; vpackusdw %xmm12,%xmm8,%xmm8 + DB 196,65,49,97,226 ; vpunpcklwd %xmm10,%xmm9,%xmm12 + DB 196,65,49,105,234 ; vpunpckhwd %xmm10,%xmm9,%xmm13 + DB 196,65,33,97,200 ; vpunpcklwd %xmm8,%xmm11,%xmm9 + DB 196,65,33,105,192 ; vpunpckhwd %xmm8,%xmm11,%xmm8 + DB 196,65,25,98,217 ; vpunpckldq %xmm9,%xmm12,%xmm11 + DB 196,65,25,106,209 ; vpunpckhdq %xmm9,%xmm12,%xmm10 + DB 196,65,17,98,200 ; vpunpckldq %xmm8,%xmm13,%xmm9 + DB 196,65,17,106,192 ; vpunpckhdq %xmm8,%xmm13,%xmm8 DB 72,133,201 ; test %rcx,%rcx - DB 117,31 ; jne 21de <_sk_store_f16_avx+0xd6> + DB 117,31 ; jne 21da <_sk_store_f16_avx+0xd2> DB 196,65,120,17,28,248 ; vmovups %xmm11,(%r8,%rdi,8) DB 196,65,120,17,84,248,16 ; vmovups %xmm10,0x10(%r8,%rdi,8) DB 196,65,120,17,76,248,32 ; vmovups %xmm9,0x20(%r8,%rdi,8) @@ -4320,22 +4320,22 @@ _sk_store_f16_avx LABEL PROC DB 255,224 ; jmpq *%rax DB 196,65,121,214,28,248 ; vmovq %xmm11,(%r8,%rdi,8) DB 72,131,249,1 ; cmp $0x1,%rcx - DB 116,240 ; je 21da <_sk_store_f16_avx+0xd2> + DB 116,240 ; je 21d6 <_sk_store_f16_avx+0xce> DB 196,65,121,23,92,248,8 ; vmovhpd %xmm11,0x8(%r8,%rdi,8) DB 72,131,249,3 ; cmp $0x3,%rcx - DB 114,227 ; jb 21da <_sk_store_f16_avx+0xd2> + DB 114,227 ; jb 21d6 <_sk_store_f16_avx+0xce> DB 196,65,121,214,84,248,16 ; vmovq %xmm10,0x10(%r8,%rdi,8) - DB 116,218 ; je 21da <_sk_store_f16_avx+0xd2> + DB 116,218 ; je 21d6 <_sk_store_f16_avx+0xce> DB 196,65,121,23,84,248,24 ; vmovhpd %xmm10,0x18(%r8,%rdi,8) DB 72,131,249,5 ; cmp $0x5,%rcx - DB 114,205 ; jb 21da <_sk_store_f16_avx+0xd2> + DB 114,205 ; jb 21d6 <_sk_store_f16_avx+0xce> DB 196,65,121,214,76,248,32 ; vmovq %xmm9,0x20(%r8,%rdi,8) - DB 116,196 ; je 21da <_sk_store_f16_avx+0xd2> + DB 116,196 ; je 21d6 <_sk_store_f16_avx+0xce> DB 196,65,121,23,76,248,40 ; vmovhpd %xmm9,0x28(%r8,%rdi,8) DB 72,131,249,7 ; cmp $0x7,%rcx - DB 114,183 ; jb 21da <_sk_store_f16_avx+0xd2> + DB 114,183 ; jb 21d6 <_sk_store_f16_avx+0xce> DB 196,65,121,214,68,248,48 ; vmovq %xmm8,0x30(%r8,%rdi,8) - DB 235,174 ; jmp 21da <_sk_store_f16_avx+0xd2> + DB 235,174 ; jmp 21d6 <_sk_store_f16_avx+0xce> PUBLIC _sk_store_f32_avx _sk_store_f32_avx LABEL PROC @@ -4351,7 +4351,7 @@ _sk_store_f32_avx LABEL PROC DB 196,65,37,20,196 ; vunpcklpd %ymm12,%ymm11,%ymm8 DB 196,65,37,21,220 ; vunpckhpd %ymm12,%ymm11,%ymm11 DB 72,133,201 ; test %rcx,%rcx - DB 117,55 ; jne 2299 <_sk_store_f32_avx+0x6d> + DB 117,55 ; jne 2295 
<_sk_store_f32_avx+0x6d> DB 196,67,45,24,225,1 ; vinsertf128 $0x1,%xmm9,%ymm10,%ymm12 DB 196,67,61,24,235,1 ; vinsertf128 $0x1,%xmm11,%ymm8,%ymm13 DB 196,67,45,6,201,49 ; vperm2f128 $0x31,%ymm9,%ymm10,%ymm9 @@ -4364,22 +4364,22 @@ _sk_store_f32_avx LABEL PROC DB 255,224 ; jmpq *%rax DB 196,65,121,17,20,128 ; vmovupd %xmm10,(%r8,%rax,4) DB 72,131,249,1 ; cmp $0x1,%rcx - DB 116,240 ; je 2295 <_sk_store_f32_avx+0x69> + DB 116,240 ; je 2291 <_sk_store_f32_avx+0x69> DB 196,65,121,17,76,128,16 ; vmovupd %xmm9,0x10(%r8,%rax,4) DB 72,131,249,3 ; cmp $0x3,%rcx - DB 114,227 ; jb 2295 <_sk_store_f32_avx+0x69> + DB 114,227 ; jb 2291 <_sk_store_f32_avx+0x69> DB 196,65,121,17,68,128,32 ; vmovupd %xmm8,0x20(%r8,%rax,4) - DB 116,218 ; je 2295 <_sk_store_f32_avx+0x69> + DB 116,218 ; je 2291 <_sk_store_f32_avx+0x69> DB 196,65,121,17,92,128,48 ; vmovupd %xmm11,0x30(%r8,%rax,4) DB 72,131,249,5 ; cmp $0x5,%rcx - DB 114,205 ; jb 2295 <_sk_store_f32_avx+0x69> + DB 114,205 ; jb 2291 <_sk_store_f32_avx+0x69> DB 196,67,125,25,84,128,64,1 ; vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4) - DB 116,195 ; je 2295 <_sk_store_f32_avx+0x69> + DB 116,195 ; je 2291 <_sk_store_f32_avx+0x69> DB 196,67,125,25,76,128,80,1 ; vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4) DB 72,131,249,7 ; cmp $0x7,%rcx - DB 114,181 ; jb 2295 <_sk_store_f32_avx+0x69> + DB 114,181 ; jb 2291 <_sk_store_f32_avx+0x69> DB 196,67,125,25,68,128,96,1 ; vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4) - DB 235,171 ; jmp 2295 <_sk_store_f32_avx+0x69> + DB 235,171 ; jmp 2291 <_sk_store_f32_avx+0x69> PUBLIC _sk_clamp_x_avx _sk_clamp_x_avx LABEL PROC @@ -6412,27 +6412,29 @@ _sk_store_f16_sse41 LABEL PROC DB 72,139,0 ; mov (%rax),%rax DB 185,0,0,128,7 ; mov $0x7800000,%ecx DB 102,68,15,110,193 ; movd %ecx,%xmm8 - DB 102,69,15,112,192,0 ; pshufd $0x0,%xmm8,%xmm8 - DB 102,69,15,111,200 ; movdqa %xmm8,%xmm9 - DB 68,15,89,200 ; mulps %xmm0,%xmm9 - DB 102,65,15,114,209,13 ; psrld $0xd,%xmm9 - DB 102,69,15,111,208 ; movdqa %xmm8,%xmm10 + DB 102,69,15,112,200,0 ; pshufd $0x0,%xmm8,%xmm9 + DB 102,69,15,111,193 ; movdqa %xmm9,%xmm8 + DB 68,15,89,192 ; mulps %xmm0,%xmm8 + DB 102,65,15,114,208,13 ; psrld $0xd,%xmm8 + DB 102,69,15,56,43,192 ; packusdw %xmm8,%xmm8 + DB 102,69,15,111,209 ; movdqa %xmm9,%xmm10 DB 68,15,89,209 ; mulps %xmm1,%xmm10 DB 102,65,15,114,210,13 ; psrld $0xd,%xmm10 - DB 102,69,15,111,216 ; movdqa %xmm8,%xmm11 + DB 102,69,15,56,43,210 ; packusdw %xmm10,%xmm10 + DB 102,69,15,111,217 ; movdqa %xmm9,%xmm11 DB 68,15,89,218 ; mulps %xmm2,%xmm11 DB 102,65,15,114,211,13 ; psrld $0xd,%xmm11 - DB 68,15,89,195 ; mulps %xmm3,%xmm8 - DB 102,65,15,114,208,13 ; psrld $0xd,%xmm8 - DB 102,65,15,115,250,2 ; pslldq $0x2,%xmm10 - DB 102,69,15,235,209 ; por %xmm9,%xmm10 - DB 102,65,15,115,248,2 ; pslldq $0x2,%xmm8 - DB 102,69,15,235,195 ; por %xmm11,%xmm8 - DB 102,69,15,111,202 ; movdqa %xmm10,%xmm9 - DB 102,69,15,98,200 ; punpckldq %xmm8,%xmm9 + DB 102,69,15,56,43,219 ; packusdw %xmm11,%xmm11 + DB 68,15,89,203 ; mulps %xmm3,%xmm9 + DB 102,65,15,114,209,13 ; psrld $0xd,%xmm9 + DB 102,69,15,56,43,201 ; packusdw %xmm9,%xmm9 + DB 102,69,15,97,194 ; punpcklwd %xmm10,%xmm8 + DB 102,69,15,97,217 ; punpcklwd %xmm9,%xmm11 + DB 102,69,15,111,200 ; movdqa %xmm8,%xmm9 + DB 102,69,15,98,203 ; punpckldq %xmm11,%xmm9 DB 243,68,15,127,12,248 ; movdqu %xmm9,(%rax,%rdi,8) - DB 102,69,15,106,208 ; punpckhdq %xmm8,%xmm10 - DB 243,68,15,127,84,248,16 ; movdqu %xmm10,0x10(%rax,%rdi,8) + DB 102,69,15,106,195 ; punpckhdq %xmm11,%xmm8 + DB 243,68,15,127,68,248,16 ; movdqu %xmm8,0x10(%rax,%rdi,8) DB 72,173 ; lods 
%ds:(%rsi),%rax DB 255,224 ; jmpq *%rax @@ -8599,27 +8601,37 @@ _sk_store_f16_sse2 LABEL PROC DB 72,139,0 ; mov (%rax),%rax DB 185,0,0,128,7 ; mov $0x7800000,%ecx DB 102,68,15,110,193 ; movd %ecx,%xmm8 - DB 102,69,15,112,192,0 ; pshufd $0x0,%xmm8,%xmm8 - DB 102,69,15,111,200 ; movdqa %xmm8,%xmm9 - DB 68,15,89,200 ; mulps %xmm0,%xmm9 - DB 102,65,15,114,209,13 ; psrld $0xd,%xmm9 - DB 102,69,15,111,208 ; movdqa %xmm8,%xmm10 + DB 102,69,15,112,200,0 ; pshufd $0x0,%xmm8,%xmm9 + DB 102,69,15,111,193 ; movdqa %xmm9,%xmm8 + DB 68,15,89,192 ; mulps %xmm0,%xmm8 + DB 102,65,15,114,208,13 ; psrld $0xd,%xmm8 + DB 102,65,15,114,240,16 ; pslld $0x10,%xmm8 + DB 102,65,15,114,224,16 ; psrad $0x10,%xmm8 + DB 102,69,15,107,192 ; packssdw %xmm8,%xmm8 + DB 102,69,15,111,209 ; movdqa %xmm9,%xmm10 DB 68,15,89,209 ; mulps %xmm1,%xmm10 DB 102,65,15,114,210,13 ; psrld $0xd,%xmm10 - DB 102,69,15,111,216 ; movdqa %xmm8,%xmm11 + DB 102,65,15,114,242,16 ; pslld $0x10,%xmm10 + DB 102,65,15,114,226,16 ; psrad $0x10,%xmm10 + DB 102,69,15,107,210 ; packssdw %xmm10,%xmm10 + DB 102,69,15,111,217 ; movdqa %xmm9,%xmm11 DB 68,15,89,218 ; mulps %xmm2,%xmm11 DB 102,65,15,114,211,13 ; psrld $0xd,%xmm11 - DB 68,15,89,195 ; mulps %xmm3,%xmm8 - DB 102,65,15,114,208,13 ; psrld $0xd,%xmm8 - DB 102,65,15,115,250,2 ; pslldq $0x2,%xmm10 - DB 102,69,15,235,209 ; por %xmm9,%xmm10 - DB 102,65,15,115,248,2 ; pslldq $0x2,%xmm8 - DB 102,69,15,235,195 ; por %xmm11,%xmm8 - DB 102,69,15,111,202 ; movdqa %xmm10,%xmm9 - DB 102,69,15,98,200 ; punpckldq %xmm8,%xmm9 + DB 102,65,15,114,243,16 ; pslld $0x10,%xmm11 + DB 102,65,15,114,227,16 ; psrad $0x10,%xmm11 + DB 102,69,15,107,219 ; packssdw %xmm11,%xmm11 + DB 68,15,89,203 ; mulps %xmm3,%xmm9 + DB 102,65,15,114,209,13 ; psrld $0xd,%xmm9 + DB 102,65,15,114,241,16 ; pslld $0x10,%xmm9 + DB 102,65,15,114,225,16 ; psrad $0x10,%xmm9 + DB 102,69,15,107,201 ; packssdw %xmm9,%xmm9 + DB 102,69,15,97,194 ; punpcklwd %xmm10,%xmm8 + DB 102,69,15,97,217 ; punpcklwd %xmm9,%xmm11 + DB 102,69,15,111,200 ; movdqa %xmm8,%xmm9 + DB 102,69,15,98,203 ; punpckldq %xmm11,%xmm9 DB 243,68,15,127,12,248 ; movdqu %xmm9,(%rax,%rdi,8) - DB 102,69,15,106,208 ; punpckhdq %xmm8,%xmm10 - DB 243,68,15,127,84,248,16 ; movdqu %xmm10,0x10(%rax,%rdi,8) + DB 102,69,15,106,195 ; punpckhdq %xmm11,%xmm8 + DB 243,68,15,127,68,248,16 ; movdqu %xmm8,0x10(%rax,%rdi,8) DB 72,173 ; lods %ds:(%rsi),%rax DB 255,224 ; jmpq *%rax diff --git a/src/jumper/SkJumper_misc.h b/src/jumper/SkJumper_misc.h index 96035bd084..54e957ad6e 100644 --- a/src/jumper/SkJumper_misc.h +++ b/src/jumper/SkJumper_misc.h @@ -28,6 +28,14 @@ SI Dst bit_cast(const Src& src) { return unaligned_load<Dst>(&src); } +template <typename Dst, typename Src> +SI Dst widen_cast(const Src& src) { + static_assert(sizeof(Dst) > sizeof(Src), ""); + Dst dst; + memcpy(&dst, &src, sizeof(Src)); + return dst; +} + // A couple functions for embedding constants directly into code, // so that no .const or .literal4 section is created. SI int C(int x) { diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp index dd2bb1348f..fa64e805d6 100644 --- a/src/jumper/SkJumper_stages.cpp +++ b/src/jumper/SkJumper_stages.cpp @@ -634,117 +634,10 @@ STAGE(load_f16) { STAGE(store_f16) { auto ptr = *(uint64_t**)ctx + x; -#if !defined(JUMPER) - auto float_to_half = [&](F f) { - return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i))) // Fix up the exponent, - >> 13; // then line up the mantissa. 
- }; - auto rgba = (int16_t*)ptr; - rgba[0] = float_to_half(r); - rgba[1] = float_to_half(g); - rgba[2] = float_to_half(b); - rgba[3] = float_to_half(a); -#elif defined(__aarch64__) - float16x4x4_t halfs = {{ - vcvt_f16_f32(r), - vcvt_f16_f32(g), - vcvt_f16_f32(b), - vcvt_f16_f32(a), - }}; - vst4_f16((float16_t*)ptr, halfs); -#elif defined(__arm__) - float16x4x2_t rb_ga = {{ - vcvt_f16_f32(float32x4_t{r[0], b[0], r[1], b[1]}), - vcvt_f16_f32(float32x4_t{g[0], a[0], g[1], a[1]}), - }}; - vst2_f16((float16_t*)ptr, rb_ga); -#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__) - auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION), - G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION), - B = _mm256_cvtps_ph(b, _MM_FROUND_CUR_DIRECTION), - A = _mm256_cvtps_ph(a, _MM_FROUND_CUR_DIRECTION); - - auto rg0123 = _mm_unpacklo_epi16(R, G), // r0 g0 r1 g1 r2 g2 r3 g3 - rg4567 = _mm_unpackhi_epi16(R, G), // r4 g4 r5 g5 r6 g6 r7 g7 - ba0123 = _mm_unpacklo_epi16(B, A), - ba4567 = _mm_unpackhi_epi16(B, A); - - auto _01 = _mm_unpacklo_epi32(rg0123, ba0123), - _23 = _mm_unpackhi_epi32(rg0123, ba0123), - _45 = _mm_unpacklo_epi32(rg4567, ba4567), - _67 = _mm_unpackhi_epi32(rg4567, ba4567); - - if (__builtin_expect(tail,0)) { - auto dst = (double*)ptr; - if (tail > 0) { _mm_storel_pd(dst+0, _01); } - if (tail > 1) { _mm_storeh_pd(dst+1, _01); } - if (tail > 2) { _mm_storel_pd(dst+2, _23); } - if (tail > 3) { _mm_storeh_pd(dst+3, _23); } - if (tail > 4) { _mm_storel_pd(dst+4, _45); } - if (tail > 5) { _mm_storeh_pd(dst+5, _45); } - if (tail > 6) { _mm_storel_pd(dst+6, _67); } - } else { - _mm_storeu_si128((__m128i*)ptr + 0, _01); - _mm_storeu_si128((__m128i*)ptr + 1, _23); - _mm_storeu_si128((__m128i*)ptr + 2, _45); - _mm_storeu_si128((__m128i*)ptr + 3, _67); - } -#elif defined(__AVX__) - auto float_to_half = [&](F f) { - return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i))) // Fix up the exponent, - >> 13; // then line up the mantissa. - }; - U32 R = float_to_half(r), - G = float_to_half(g), - B = float_to_half(b), - A = float_to_half(a); - auto r0123 = _mm256_extractf128_si256(R, 0), - r4567 = _mm256_extractf128_si256(R, 1), - g0123 = _mm256_extractf128_si256(G, 0), - g4567 = _mm256_extractf128_si256(G, 1), - b0123 = _mm256_extractf128_si256(B, 0), - b4567 = _mm256_extractf128_si256(B, 1), - a0123 = _mm256_extractf128_si256(A, 0), - a4567 = _mm256_extractf128_si256(A, 1); - auto rg0123 = r0123 | _mm_slli_si128(g0123,2), - rg4567 = r4567 | _mm_slli_si128(g4567,2), - ba0123 = b0123 | _mm_slli_si128(a0123,2), - ba4567 = b4567 | _mm_slli_si128(a4567,2); - - auto _01 = _mm_unpacklo_epi32(rg0123, ba0123), - _23 = _mm_unpackhi_epi32(rg0123, ba0123), - _45 = _mm_unpacklo_epi32(rg4567, ba4567), - _67 = _mm_unpackhi_epi32(rg4567, ba4567); - - if (__builtin_expect(tail,0)) { - auto dst = (double*)ptr; - if (tail > 0) { _mm_storel_pd(dst+0, _01); } - if (tail > 1) { _mm_storeh_pd(dst+1, _01); } - if (tail > 2) { _mm_storel_pd(dst+2, _23); } - if (tail > 3) { _mm_storeh_pd(dst+3, _23); } - if (tail > 4) { _mm_storel_pd(dst+4, _45); } - if (tail > 5) { _mm_storeh_pd(dst+5, _45); } - if (tail > 6) { _mm_storel_pd(dst+6, _67); } - } else { - _mm_storeu_si128((__m128i*)ptr + 0, _01); - _mm_storeu_si128((__m128i*)ptr + 1, _23); - _mm_storeu_si128((__m128i*)ptr + 2, _45); - _mm_storeu_si128((__m128i*)ptr + 3, _67); - } -#elif defined(__SSE2__) - auto float_to_half = [&](F f) { - return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i))) // Fix up the exponent, - >> 13; // then line up the mantissa. 
- }; - U32 R = float_to_half(r), - G = float_to_half(g), - B = float_to_half(b), - A = float_to_half(a); - U32 rg = R | _mm_slli_si128(G,2), - ba = B | _mm_slli_si128(A,2); - _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba)); - _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba)); -#endif + store4(ptr,tail, to_half(r) + , to_half(g) + , to_half(b) + , to_half(a)); } STAGE(store_f32) { diff --git a/src/jumper/SkJumper_vectors.h b/src/jumper/SkJumper_vectors.h index 3e9edd8269..1685da9aa9 100644 --- a/src/jumper/SkJumper_vectors.h +++ b/src/jumper/SkJumper_vectors.h @@ -48,12 +48,23 @@ *b = ptr[2]; *a = ptr[3]; } + SI void store4(void* vptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { + auto ptr = (uint16_t*)vptr; + ptr[0] = r; + ptr[1] = g; + ptr[2] = b; + ptr[3] = a; + } SI F from_half(U16 h) { if ((int16_t)h < 0x0400) { h = 0; } // Flush denorm and negative to zero. return bit_cast<F>(h << 13) // Line up the mantissa, * bit_cast<F>(U32(0x77800000)); // then fix up the exponent. } + SI U16 to_half(F f) { + return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i))) // Fix up the exponent, + >> 13; // then line up the mantissa. + } #elif defined(__aarch64__) #include <arm_neon.h> @@ -88,11 +99,14 @@ *b = rgba.val[2]; *a = rgba.val[3]; } - - SI F from_half(U16 h) { - return vcvt_f32_f16(h); + SI void store4(void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { + uint16x4x4_t rgba = {{r,g,b,a}}; + vst4_u16((uint16_t*)ptr, rgba); } + SI F from_half(U16 h) { return vcvt_f32_f16(h); } + SI U16 to_half(F f) { return vcvt_f16_f32(f); } + #elif defined(__arm__) #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__) #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb. @@ -135,12 +149,27 @@ *b = unaligned_load<U16>(rgba.val+2); *a = unaligned_load<U16>(rgba.val+3); } + SI void store4(void* vptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { + auto ptr = (uint16_t*)vptr; + uint16x4x4_t rgba = {{ + widen_cast<uint16x4_t>(r), + widen_cast<uint16x4_t>(g), + widen_cast<uint16x4_t>(b), + widen_cast<uint16x4_t>(a), + }}; + vst4_lane_u16(ptr + 0, rgba, 0); + vst4_lane_u16(ptr + 4, rgba, 1); + } SI F from_half(U16 h) { - uint16x4_t v; - memcpy(&v, &h, sizeof(h)); + auto v = widen_cast<uint16x4_t>(h); return vget_low_f32(vcvt_f32_f16(v)); } + SI U16 to_half(F f) { + auto v = widen_cast<float32x4_t>(f); + uint16x4_t h = vcvt_f16_f32(v); + return unaligned_load<U16>(&h); + } #elif defined(__AVX__) #include <immintrin.h> @@ -222,6 +251,33 @@ *b = _mm_unpacklo_epi64(ba0123, ba4567); *a = _mm_unpackhi_epi64(ba0123, ba4567); } + SI void store4(void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { + auto rg0123 = _mm_unpacklo_epi16(r, g), // r0 g0 r1 g1 r2 g2 r3 g3 + rg4567 = _mm_unpackhi_epi16(r, g), // r4 g4 r5 g5 r6 g6 r7 g7 + ba0123 = _mm_unpacklo_epi16(b, a), + ba4567 = _mm_unpackhi_epi16(b, a); + + auto _01 = _mm_unpacklo_epi32(rg0123, ba0123), + _23 = _mm_unpackhi_epi32(rg0123, ba0123), + _45 = _mm_unpacklo_epi32(rg4567, ba4567), + _67 = _mm_unpackhi_epi32(rg4567, ba4567); + + if (__builtin_expect(tail,0)) { + auto dst = (double*)ptr; + if (tail > 0) { _mm_storel_pd(dst+0, _01); } + if (tail > 1) { _mm_storeh_pd(dst+1, _01); } + if (tail > 2) { _mm_storel_pd(dst+2, _23); } + if (tail > 3) { _mm_storeh_pd(dst+3, _23); } + if (tail > 4) { _mm_storel_pd(dst+4, _45); } + if (tail > 5) { _mm_storeh_pd(dst+5, _45); } + if (tail > 6) { _mm_storel_pd(dst+6, _67); } + } else { + _mm_storeu_si128((__m128i*)ptr + 0, _01); + 
_mm_storeu_si128((__m128i*)ptr + 1, _23); + _mm_storeu_si128((__m128i*)ptr + 2, _45); + _mm_storeu_si128((__m128i*)ptr + 3, _67); + } + } SI F from_half(U16 h) { #if defined(__AVX2__) @@ -237,6 +293,14 @@ * bit_cast<F>(U32(0x77800000_i)); // then fix up the exponent. #endif } + SI U16 to_half(F f) { + #if defined(__AVX2__) + return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION); + #else + return pack(bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i))) // Fix up the exponent, + >> 13); // then line up the mantissa. + #endif + } #elif defined(__SSE2__) #include <immintrin.h> @@ -266,8 +330,7 @@ return unaligned_load<U16>(&p); // We have two copies. Return (the lower) one. } SI U8 pack(U16 v) { - __m128i r; - memcpy(&r, &v, sizeof(v)); + auto r = widen_cast<__m128i>(v); r = _mm_packus_epi16(r,r); return unaligned_load<U8>(&r); } @@ -302,10 +365,15 @@ *b = unaligned_load<U16>((uint16_t*)&ba + 0); *a = unaligned_load<U16>((uint16_t*)&ba + 4); } + SI void store4(const void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { + auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)), + ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a)); + _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba)); + _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba)); + } SI F from_half(U16 h) { - __m128i v; - memcpy(&v, &h, sizeof(h)); + auto v = widen_cast<__m128i>(h); // Same deal as AVX: flush denorms and negatives to zero. v = _mm_andnot_si128(_mm_cmplt_epi16(v, _mm_set1_epi32(0x04000400_i)), v); @@ -314,6 +382,10 @@ return bit_cast<F>(w << 13) // Line up the mantissa, * bit_cast<F>(U32(0x77800000_i)); // then fix up the exponent. } + SI U16 to_half(F f) { + return pack(bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i))) // Fix up the exponent, + >> 13); // then line up the mantissa. + } #endif // We need to be a careful with casts. |
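
The portable to_half() in the hunks above relies on an exponent-rebias trick: multiplying by the float whose bit pattern is 0x07800000 (the value 2^-112) rebiases the exponent from float's 127 to half's 15, and shifting right by 13 drops the low mantissa bits so the result lands in half's 1:5:10 layout. As a rough scalar sketch (a hypothetical standalone helper, not code from this CL; like the SIMD version it skips rounding and flushes the sign, which is fine for the pipeline's in-range, non-negative color values):

```cpp
#include <cstdint>
#include <cstring>

// Hypothetical scalar equivalent of the SIMD to_half() above.
uint16_t to_half_scalar(float f) {
    uint32_t magic = 0x07800000;           // bit pattern of 2^-112 as a float
    float fixup;
    std::memcpy(&fixup, &magic, sizeof fixup);

    f *= fixup;                            // rebias exponent: 127 -> 15
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof bits);
    return (uint16_t)(bits >> 13);         // truncate mantissa: 23 -> 10 bits
}
```

from_half() runs the same trick in reverse, shifting left by 13 and multiplying by the float with bit pattern 0x77800000 (2^112), as shown in the SkJumper_vectors.h hunks.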