author    Mike Klein <mtklein@chromium.org>  2017-02-23 11:01:52 -0500
committer Mike Klein <mtklein@chromium.org>  2017-02-23 16:55:04 +0000
commit    db356b7213bfd3ed636e158b5427be68adf01bed (patch)
tree      670619b126497d1c3d1a604ea156596f8f97645f /src
parent    bff4178936a1fdeedd693e82a70c78f36c873915 (diff)
SkJumper: fill in AVX f16 stages, turn on AVX
As far as I can tell, this draws identically to the SSE4.1 backend.

Change-Id: Id650db59a84d779b84d45f42e60321732e28d803
Reviewed-on: https://skia-review.googlesource.com/8913
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
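The heart of these f16 stages is a multiply-by-a-power-of-two bit trick for converting between half and single precision, since plain AVX has no F16C instructions. Below is a minimal scalar sketch of that trick, using the same magic constants the stages load from their constants block (k->_0x77800000 and k->_0x07800000); the helper names are illustrative, and the sketch deliberately skips the denorm/negative flush that the vector code performs first.

    #include <cstdint>
    #include <cstring>

    static float    bit_cast_f(uint32_t u) { float f;    std::memcpy(&f, &u, sizeof f); return f; }
    static uint32_t bit_cast_u(float    f) { uint32_t u; std::memcpy(&u, &f, sizeof u); return u; }

    // half -> float: shift the half's exponent and mantissa up into the float's bit
    // positions, then multiply by 2^112 (bits 0x77800000) to rebias the exponent (15 -> 127).
    static float half_to_float(uint16_t h) {
        return bit_cast_f(uint32_t(h) << 13) * bit_cast_f(0x77800000u);
    }

    // float -> half: multiply by 2^-112 (bits 0x07800000) to rebias the exponent,
    // then shift the bits back down into the half's 16-bit layout.
    static uint16_t float_to_half(float f) {
        return uint16_t(bit_cast_u(f * bit_cast_f(0x07800000u)) >> 13);
    }

The AVX stages below do exactly this on eight pixels at a time, which is where all of the deinterleave/interleave shuffles in the generated code come from.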
Diffstat (limited to 'src')
-rw-r--r--  src/jumper/SkJumper.cpp              |  2
-rw-r--r--  src/jumper/SkJumper_generated.S      | 87
-rw-r--r--  src/jumper/SkJumper_generated_win.S  | 87
-rw-r--r--  src/jumper/SkJumper_stages.cpp       | 79
4 files changed, 248 insertions(+), 7 deletions(-)
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index b5271a6a58..4cfd78d126 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -276,7 +276,7 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
return false;
}
}
- if (0 && SkCpu::Supports(SkCpu::AVX)) {
+ if (1 && SkCpu::Supports(SkCpu::AVX)) {
if (!build_and_run(8, lookup_avx, ASM(just_return,avx), ASM(start_pipeline,avx))) {
return false;
}
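The one-line SkJumper.cpp change above is what actually "turns on AVX": each backend sits behind a literal guard so it can land disabled and be flipped on once its stages are complete. A tiny sketch of that pattern, with hypothetical stand-ins (has_avx, run_avx, run_sse2) in place of SkCpu::Supports() and build_and_run():

    #include <cstddef>

    // Hypothetical stand-ins, just to show the shape of the guard.
    static bool has_avx()                { return false; }
    static bool run_avx (size_t, size_t) { return true;  }
    static bool run_sse2(size_t, size_t) { return true;  }

    static bool run_pipeline(size_t x, size_t n) {
        // `0 && ...` parks a backend (it compiles but never runs); flipping it to
        // `1 && ...` enables it once its stages are filled in, as this commit does for AVX.
        if (1 && has_avx()) {
            return run_avx(x, n);
        }
        return run_sse2(x, n);   // fall back to a baseline backend
    }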
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index c313e691c0..0c805a4652 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -2401,12 +2401,99 @@ _sk_store_8888_avx:
.globl _sk_load_f16_avx
_sk_load_f16_avx:
.byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 72,139,0 // mov (%rax),%rax
+ .byte 197,250,111,4,248 // vmovdqu (%rax,%rdi,8),%xmm0
+ .byte 197,250,111,76,248,16 // vmovdqu 0x10(%rax,%rdi,8),%xmm1
+ .byte 197,250,111,84,248,32 // vmovdqu 0x20(%rax,%rdi,8),%xmm2
+ .byte 197,250,111,92,248,48 // vmovdqu 0x30(%rax,%rdi,8),%xmm3
+ .byte 197,121,97,193 // vpunpcklwd %xmm1,%xmm0,%xmm8
+ .byte 197,249,105,193 // vpunpckhwd %xmm1,%xmm0,%xmm0
+ .byte 197,233,97,203 // vpunpcklwd %xmm3,%xmm2,%xmm1
+ .byte 197,233,105,211 // vpunpckhwd %xmm3,%xmm2,%xmm2
+ .byte 197,185,97,216 // vpunpcklwd %xmm0,%xmm8,%xmm3
+ .byte 197,185,105,192 // vpunpckhwd %xmm0,%xmm8,%xmm0
+ .byte 197,113,97,194 // vpunpcklwd %xmm2,%xmm1,%xmm8
+ .byte 197,113,105,202 // vpunpckhwd %xmm2,%xmm1,%xmm9
+ .byte 197,249,110,82,100 // vmovd 0x64(%rdx),%xmm2
+ .byte 197,249,112,210,0 // vpshufd $0x0,%xmm2,%xmm2
+ .byte 197,233,101,203 // vpcmpgtw %xmm3,%xmm2,%xmm1
+ .byte 197,241,223,203 // vpandn %xmm3,%xmm1,%xmm1
+ .byte 197,233,101,216 // vpcmpgtw %xmm0,%xmm2,%xmm3
+ .byte 197,225,223,192 // vpandn %xmm0,%xmm3,%xmm0
+ .byte 196,193,105,101,216 // vpcmpgtw %xmm8,%xmm2,%xmm3
+ .byte 196,193,97,223,216 // vpandn %xmm8,%xmm3,%xmm3
+ .byte 196,193,105,101,209 // vpcmpgtw %xmm9,%xmm2,%xmm2
+ .byte 196,193,105,223,209 // vpandn %xmm9,%xmm2,%xmm2
+ .byte 196,98,121,51,193 // vpmovzxwd %xmm1,%xmm8
+ .byte 196,98,121,51,203 // vpmovzxwd %xmm3,%xmm9
+ .byte 196,65,41,239,210 // vpxor %xmm10,%xmm10,%xmm10
+ .byte 196,193,113,105,202 // vpunpckhwd %xmm10,%xmm1,%xmm1
+ .byte 196,193,97,105,218 // vpunpckhwd %xmm10,%xmm3,%xmm3
+ .byte 196,98,121,51,216 // vpmovzxwd %xmm0,%xmm11
+ .byte 196,98,121,51,226 // vpmovzxwd %xmm2,%xmm12
+ .byte 196,65,121,105,234 // vpunpckhwd %xmm10,%xmm0,%xmm13
+ .byte 196,65,105,105,210 // vpunpckhwd %xmm10,%xmm2,%xmm10
+ .byte 196,193,121,114,240,13 // vpslld $0xd,%xmm8,%xmm0
+ .byte 196,193,105,114,241,13 // vpslld $0xd,%xmm9,%xmm2
+ .byte 196,227,125,24,194,1 // vinsertf128 $0x1,%xmm2,%ymm0,%ymm0
+ .byte 197,249,110,82,92 // vmovd 0x5c(%rdx),%xmm2
+ .byte 196,227,121,4,210,0 // vpermilps $0x0,%xmm2,%xmm2
+ .byte 196,99,109,24,194,1 // vinsertf128 $0x1,%xmm2,%ymm2,%ymm8
+ .byte 197,188,89,192 // vmulps %ymm0,%ymm8,%ymm0
+ .byte 197,241,114,241,13 // vpslld $0xd,%xmm1,%xmm1
+ .byte 197,233,114,243,13 // vpslld $0xd,%xmm3,%xmm2
+ .byte 196,227,117,24,202,1 // vinsertf128 $0x1,%xmm2,%ymm1,%ymm1
+ .byte 197,188,89,201 // vmulps %ymm1,%ymm8,%ymm1
+ .byte 196,193,105,114,243,13 // vpslld $0xd,%xmm11,%xmm2
+ .byte 196,193,97,114,244,13 // vpslld $0xd,%xmm12,%xmm3
+ .byte 196,227,109,24,211,1 // vinsertf128 $0x1,%xmm3,%ymm2,%ymm2
+ .byte 197,188,89,210 // vmulps %ymm2,%ymm8,%ymm2
+ .byte 196,193,49,114,245,13 // vpslld $0xd,%xmm13,%xmm9
+ .byte 196,193,97,114,242,13 // vpslld $0xd,%xmm10,%xmm3
+ .byte 196,227,53,24,219,1 // vinsertf128 $0x1,%xmm3,%ymm9,%ymm3
+ .byte 197,188,89,219 // vmulps %ymm3,%ymm8,%ymm3
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
.globl _sk_store_f16_avx
_sk_store_f16_avx:
.byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 72,139,0 // mov (%rax),%rax
+ .byte 197,121,110,66,96 // vmovd 0x60(%rdx),%xmm8
+ .byte 196,67,121,4,192,0 // vpermilps $0x0,%xmm8,%xmm8
+ .byte 196,67,61,24,192,1 // vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
+ .byte 197,60,89,200 // vmulps %ymm0,%ymm8,%ymm9
+ .byte 196,67,125,25,202,1 // vextractf128 $0x1,%ymm9,%xmm10
+ .byte 196,193,41,114,210,13 // vpsrld $0xd,%xmm10,%xmm10
+ .byte 196,193,49,114,209,13 // vpsrld $0xd,%xmm9,%xmm9
+ .byte 197,60,89,217 // vmulps %ymm1,%ymm8,%ymm11
+ .byte 196,67,125,25,220,1 // vextractf128 $0x1,%ymm11,%xmm12
+ .byte 196,193,25,114,212,13 // vpsrld $0xd,%xmm12,%xmm12
+ .byte 196,193,33,114,211,13 // vpsrld $0xd,%xmm11,%xmm11
+ .byte 197,60,89,234 // vmulps %ymm2,%ymm8,%ymm13
+ .byte 196,67,125,25,238,1 // vextractf128 $0x1,%ymm13,%xmm14
+ .byte 196,193,9,114,214,13 // vpsrld $0xd,%xmm14,%xmm14
+ .byte 196,193,17,114,213,13 // vpsrld $0xd,%xmm13,%xmm13
+ .byte 197,60,89,195 // vmulps %ymm3,%ymm8,%ymm8
+ .byte 196,67,125,25,199,1 // vextractf128 $0x1,%ymm8,%xmm15
+ .byte 196,193,1,114,215,13 // vpsrld $0xd,%xmm15,%xmm15
+ .byte 196,193,57,114,208,13 // vpsrld $0xd,%xmm8,%xmm8
+ .byte 196,193,33,115,251,2 // vpslldq $0x2,%xmm11,%xmm11
+ .byte 196,65,33,235,201 // vpor %xmm9,%xmm11,%xmm9
+ .byte 196,193,33,115,252,2 // vpslldq $0x2,%xmm12,%xmm11
+ .byte 196,65,33,235,210 // vpor %xmm10,%xmm11,%xmm10
+ .byte 196,193,57,115,248,2 // vpslldq $0x2,%xmm8,%xmm8
+ .byte 196,65,57,235,197 // vpor %xmm13,%xmm8,%xmm8
+ .byte 196,193,33,115,255,2 // vpslldq $0x2,%xmm15,%xmm11
+ .byte 196,65,33,235,222 // vpor %xmm14,%xmm11,%xmm11
+ .byte 196,65,49,98,224 // vpunpckldq %xmm8,%xmm9,%xmm12
+ .byte 197,122,127,36,248 // vmovdqu %xmm12,(%rax,%rdi,8)
+ .byte 196,65,49,106,192 // vpunpckhdq %xmm8,%xmm9,%xmm8
+ .byte 197,122,127,68,248,16 // vmovdqu %xmm8,0x10(%rax,%rdi,8)
+ .byte 196,65,41,98,195 // vpunpckldq %xmm11,%xmm10,%xmm8
+ .byte 197,122,127,68,248,32 // vmovdqu %xmm8,0x20(%rax,%rdi,8)
+ .byte 196,65,41,106,195 // vpunpckhdq %xmm11,%xmm10,%xmm8
+ .byte 197,122,127,68,248,48 // vmovdqu %xmm8,0x30(%rax,%rdi,8)
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index ea620945a3..41adfcf656 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -1163,12 +1163,99 @@ _sk_store_8888_avx LABEL PROC
PUBLIC _sk_load_f16_avx
_sk_load_f16_avx LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 72,139,0 ; mov (%rax),%rax
+ DB 197,250,111,4,248 ; vmovdqu (%rax,%rdi,8),%xmm0
+ DB 197,250,111,76,248,16 ; vmovdqu 0x10(%rax,%rdi,8),%xmm1
+ DB 197,250,111,84,248,32 ; vmovdqu 0x20(%rax,%rdi,8),%xmm2
+ DB 197,250,111,92,248,48 ; vmovdqu 0x30(%rax,%rdi,8),%xmm3
+ DB 197,121,97,193 ; vpunpcklwd %xmm1,%xmm0,%xmm8
+ DB 197,249,105,193 ; vpunpckhwd %xmm1,%xmm0,%xmm0
+ DB 197,233,97,203 ; vpunpcklwd %xmm3,%xmm2,%xmm1
+ DB 197,233,105,211 ; vpunpckhwd %xmm3,%xmm2,%xmm2
+ DB 197,185,97,216 ; vpunpcklwd %xmm0,%xmm8,%xmm3
+ DB 197,185,105,192 ; vpunpckhwd %xmm0,%xmm8,%xmm0
+ DB 197,113,97,194 ; vpunpcklwd %xmm2,%xmm1,%xmm8
+ DB 197,113,105,202 ; vpunpckhwd %xmm2,%xmm1,%xmm9
+ DB 197,249,110,82,100 ; vmovd 0x64(%rdx),%xmm2
+ DB 197,249,112,210,0 ; vpshufd $0x0,%xmm2,%xmm2
+ DB 197,233,101,203 ; vpcmpgtw %xmm3,%xmm2,%xmm1
+ DB 197,241,223,203 ; vpandn %xmm3,%xmm1,%xmm1
+ DB 197,233,101,216 ; vpcmpgtw %xmm0,%xmm2,%xmm3
+ DB 197,225,223,192 ; vpandn %xmm0,%xmm3,%xmm0
+ DB 196,193,105,101,216 ; vpcmpgtw %xmm8,%xmm2,%xmm3
+ DB 196,193,97,223,216 ; vpandn %xmm8,%xmm3,%xmm3
+ DB 196,193,105,101,209 ; vpcmpgtw %xmm9,%xmm2,%xmm2
+ DB 196,193,105,223,209 ; vpandn %xmm9,%xmm2,%xmm2
+ DB 196,98,121,51,193 ; vpmovzxwd %xmm1,%xmm8
+ DB 196,98,121,51,203 ; vpmovzxwd %xmm3,%xmm9
+ DB 196,65,41,239,210 ; vpxor %xmm10,%xmm10,%xmm10
+ DB 196,193,113,105,202 ; vpunpckhwd %xmm10,%xmm1,%xmm1
+ DB 196,193,97,105,218 ; vpunpckhwd %xmm10,%xmm3,%xmm3
+ DB 196,98,121,51,216 ; vpmovzxwd %xmm0,%xmm11
+ DB 196,98,121,51,226 ; vpmovzxwd %xmm2,%xmm12
+ DB 196,65,121,105,234 ; vpunpckhwd %xmm10,%xmm0,%xmm13
+ DB 196,65,105,105,210 ; vpunpckhwd %xmm10,%xmm2,%xmm10
+ DB 196,193,121,114,240,13 ; vpslld $0xd,%xmm8,%xmm0
+ DB 196,193,105,114,241,13 ; vpslld $0xd,%xmm9,%xmm2
+ DB 196,227,125,24,194,1 ; vinsertf128 $0x1,%xmm2,%ymm0,%ymm0
+ DB 197,249,110,82,92 ; vmovd 0x5c(%rdx),%xmm2
+ DB 196,227,121,4,210,0 ; vpermilps $0x0,%xmm2,%xmm2
+ DB 196,99,109,24,194,1 ; vinsertf128 $0x1,%xmm2,%ymm2,%ymm8
+ DB 197,188,89,192 ; vmulps %ymm0,%ymm8,%ymm0
+ DB 197,241,114,241,13 ; vpslld $0xd,%xmm1,%xmm1
+ DB 197,233,114,243,13 ; vpslld $0xd,%xmm3,%xmm2
+ DB 196,227,117,24,202,1 ; vinsertf128 $0x1,%xmm2,%ymm1,%ymm1
+ DB 197,188,89,201 ; vmulps %ymm1,%ymm8,%ymm1
+ DB 196,193,105,114,243,13 ; vpslld $0xd,%xmm11,%xmm2
+ DB 196,193,97,114,244,13 ; vpslld $0xd,%xmm12,%xmm3
+ DB 196,227,109,24,211,1 ; vinsertf128 $0x1,%xmm3,%ymm2,%ymm2
+ DB 197,188,89,210 ; vmulps %ymm2,%ymm8,%ymm2
+ DB 196,193,49,114,245,13 ; vpslld $0xd,%xmm13,%xmm9
+ DB 196,193,97,114,242,13 ; vpslld $0xd,%xmm10,%xmm3
+ DB 196,227,53,24,219,1 ; vinsertf128 $0x1,%xmm3,%ymm9,%ymm3
+ DB 197,188,89,219 ; vmulps %ymm3,%ymm8,%ymm3
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
PUBLIC _sk_store_f16_avx
_sk_store_f16_avx LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 72,139,0 ; mov (%rax),%rax
+ DB 197,121,110,66,96 ; vmovd 0x60(%rdx),%xmm8
+ DB 196,67,121,4,192,0 ; vpermilps $0x0,%xmm8,%xmm8
+ DB 196,67,61,24,192,1 ; vinsertf128 $0x1,%xmm8,%ymm8,%ymm8
+ DB 197,60,89,200 ; vmulps %ymm0,%ymm8,%ymm9
+ DB 196,67,125,25,202,1 ; vextractf128 $0x1,%ymm9,%xmm10
+ DB 196,193,41,114,210,13 ; vpsrld $0xd,%xmm10,%xmm10
+ DB 196,193,49,114,209,13 ; vpsrld $0xd,%xmm9,%xmm9
+ DB 197,60,89,217 ; vmulps %ymm1,%ymm8,%ymm11
+ DB 196,67,125,25,220,1 ; vextractf128 $0x1,%ymm11,%xmm12
+ DB 196,193,25,114,212,13 ; vpsrld $0xd,%xmm12,%xmm12
+ DB 196,193,33,114,211,13 ; vpsrld $0xd,%xmm11,%xmm11
+ DB 197,60,89,234 ; vmulps %ymm2,%ymm8,%ymm13
+ DB 196,67,125,25,238,1 ; vextractf128 $0x1,%ymm13,%xmm14
+ DB 196,193,9,114,214,13 ; vpsrld $0xd,%xmm14,%xmm14
+ DB 196,193,17,114,213,13 ; vpsrld $0xd,%xmm13,%xmm13
+ DB 197,60,89,195 ; vmulps %ymm3,%ymm8,%ymm8
+ DB 196,67,125,25,199,1 ; vextractf128 $0x1,%ymm8,%xmm15
+ DB 196,193,1,114,215,13 ; vpsrld $0xd,%xmm15,%xmm15
+ DB 196,193,57,114,208,13 ; vpsrld $0xd,%xmm8,%xmm8
+ DB 196,193,33,115,251,2 ; vpslldq $0x2,%xmm11,%xmm11
+ DB 196,65,33,235,201 ; vpor %xmm9,%xmm11,%xmm9
+ DB 196,193,33,115,252,2 ; vpslldq $0x2,%xmm12,%xmm11
+ DB 196,65,33,235,210 ; vpor %xmm10,%xmm11,%xmm10
+ DB 196,193,57,115,248,2 ; vpslldq $0x2,%xmm8,%xmm8
+ DB 196,65,57,235,197 ; vpor %xmm13,%xmm8,%xmm8
+ DB 196,193,33,115,255,2 ; vpslldq $0x2,%xmm15,%xmm11
+ DB 196,65,33,235,222 ; vpor %xmm14,%xmm11,%xmm11
+ DB 196,65,49,98,224 ; vpunpckldq %xmm8,%xmm9,%xmm12
+ DB 197,122,127,36,248 ; vmovdqu %xmm12,(%rax,%rdi,8)
+ DB 196,65,49,106,192 ; vpunpckhdq %xmm8,%xmm9,%xmm8
+ DB 197,122,127,68,248,16 ; vmovdqu %xmm8,0x10(%rax,%rdi,8)
+ DB 196,65,41,98,195 ; vpunpckldq %xmm11,%xmm10,%xmm8
+ DB 197,122,127,68,248,32 ; vmovdqu %xmm8,0x20(%rax,%rdi,8)
+ DB 196,65,41,106,195 ; vpunpckhdq %xmm11,%xmm10,%xmm8
+ DB 197,122,127,68,248,48 ; vmovdqu %xmm8,0x30(%rax,%rdi,8)
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index 21e3c3590b..6437c122fa 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -524,7 +524,49 @@ STAGE(load_f16) {
b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567));
a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567));
#elif defined(__AVX__)
- // TODO
+ auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
+ _23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
+ _45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
+ _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
+
+ auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
+ _13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3
+ _46 = _mm_unpacklo_epi16(_45, _67),
+ _57 = _mm_unpackhi_epi16(_45, _67);
+
+ auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
+ ba0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 a0 a1 a2 a3
+ rg4567 = _mm_unpacklo_epi16(_46, _57),
+ ba4567 = _mm_unpackhi_epi16(_46, _57);
+
+ // half_to_float() slows down ~10x for denorm inputs, so we flush them to zero.
+ // With a signed comparison this conveniently also flushes negative half floats to zero.
+ auto ftz = [k](__m128i v) {
+ return _mm_andnot_si128(_mm_cmplt_epi16(v, _mm_set1_epi32(k->_0x04000400)), v);
+ };
+ rg0123 = ftz(rg0123);
+ ba0123 = ftz(ba0123);
+ rg4567 = ftz(rg4567);
+ ba4567 = ftz(ba4567);
+
+ U32 R = _mm256_setr_m128i(_mm_unpacklo_epi16(rg0123, _mm_setzero_si128()),
+ _mm_unpacklo_epi16(rg4567, _mm_setzero_si128())),
+ G = _mm256_setr_m128i(_mm_unpackhi_epi16(rg0123, _mm_setzero_si128()),
+ _mm_unpackhi_epi16(rg4567, _mm_setzero_si128())),
+ B = _mm256_setr_m128i(_mm_unpacklo_epi16(ba0123, _mm_setzero_si128()),
+ _mm_unpacklo_epi16(ba4567, _mm_setzero_si128())),
+ A = _mm256_setr_m128i(_mm_unpackhi_epi16(ba0123, _mm_setzero_si128()),
+ _mm_unpackhi_epi16(ba4567, _mm_setzero_si128()));
+
+ auto half_to_float = [&](U32 h) {
+ return bit_cast<F>(h << 13) // Line up the mantissa,
+ * bit_cast<F>(U32(k->_0x77800000)); // then fix up the exponent.
+ };
+
+ r = half_to_float(R);
+ g = half_to_float(G);
+ b = half_to_float(B);
+ a = half_to_float(A);
#elif defined(__SSE2__)
auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
@@ -536,10 +578,12 @@ STAGE(load_f16) {
auto rg = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3
- // half_to_float() slows down ~10x for denorm inputs, so we flush them to zero.
- // With a signed comparison this conveniently also flushes negative half floats to zero.
- rg = _mm_andnot_si128(_mm_cmplt_epi16(rg, U32(k->_0x04000400)), rg);
- ba = _mm_andnot_si128(_mm_cmplt_epi16(ba, U32(k->_0x04000400)), ba);
+ // Same deal as AVX, flush denorms and negatives to zero.
+ auto ftz = [k](__m128i v) {
+ return _mm_andnot_si128(_mm_cmplt_epi16(v, _mm_set1_epi32(k->_0x04000400)), v);
+ };
+ rg = ftz(rg);
+ ba = ftz(ba);
auto half_to_float = [&](U32 h) {
return bit_cast<F>(h << 13) // Line up the mantissa,
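As an aside, the ftz lambda used by both the AVX and SSE2 branches leans on the half-float bit layout: any pattern below 0x0400 is a denorm, and a signed 16-bit compare also catches every negative half, so one compare-and-andnot flushes both. A standalone sketch of the same idea with SSE2 intrinsics, where the hypothetical name flush_denorms_to_zero and _mm_set1_epi16(0x0400) stand in for the ftz lambda and k->_0x04000400:

    #include <emmintrin.h>   // SSE2

    // Zero every 16-bit lane whose bits, read as a signed value, are below 0x0400
    // (the smallest normal half). Denorms and all negative halves compare less-than,
    // so both get flushed to +0.
    static __m128i flush_denorms_to_zero(__m128i halves) {
        const __m128i limit = _mm_set1_epi16(0x0400);
        __m128i small = _mm_cmplt_epi16(halves, limit);   // 0xFFFF where lane < 0x0400
        return _mm_andnot_si128(small, halves);           // keep only the big-enough lanes
    }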
@@ -596,7 +640,30 @@ STAGE(store_f16) {
_mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
_mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
#elif defined(__AVX__)
- // TODO
+ auto float_to_half = [&](F f) {
+ return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000))) // Fix up the exponent,
+ >> 13; // then line up the mantissa.
+ };
+ U32 R = float_to_half(r),
+ G = float_to_half(g),
+ B = float_to_half(b),
+ A = float_to_half(a);
+ auto r0123 = _mm256_extractf128_si256(R, 0),
+ r4567 = _mm256_extractf128_si256(R, 1),
+ g0123 = _mm256_extractf128_si256(G, 0),
+ g4567 = _mm256_extractf128_si256(G, 1),
+ b0123 = _mm256_extractf128_si256(B, 0),
+ b4567 = _mm256_extractf128_si256(B, 1),
+ a0123 = _mm256_extractf128_si256(A, 0),
+ a4567 = _mm256_extractf128_si256(A, 1);
+ auto rg0123 = r0123 | _mm_slli_si128(g0123,2),
+ rg4567 = r4567 | _mm_slli_si128(g4567,2),
+ ba0123 = b0123 | _mm_slli_si128(a0123,2),
+ ba4567 = b4567 | _mm_slli_si128(a4567,2);
+ _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg0123, ba0123));
+ _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
+ _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
+ _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
#elif defined(__SSE2__)
auto float_to_half = [&](F f) {
return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000))) // Fix up the exponent,