author    | Mike Klein <mtklein@chromium.org> | 2017-08-29 08:40:48 -0400
committer | Mike Klein <mtklein@chromium.org> | 2017-08-29 17:04:47 +0000
commit    | 9d7e57d509149dd2fcb3ba73ea8f4cdce11f84bd (patch)
tree      | 5442beb60c037b62ebc9477742d6490fb6dcac20 /src/jumper
parent    | 6d13575108299951ecdfba6d85c915fcec2bc028 (diff)
Revert "Revert "8-bit jumper on armv8""
This reverts commit 6d13575108299951ecdfba6d85c915fcec2bc028.
Now with guards for "errors" like this:
    external/skia/src/jumper/SkJumper_stages_8bit.cpp:240:50: error: 'memcpy' called with size bigger than buffer
            case 12: memcpy(&v, src, 12*sizeof(T)); break;
This code is unreachable and generally removed by Clang's optimizer
anyway... as far as I can tell the code generation diff is arbitrary.
Change-Id: I6216567caaa6166f71258bd25343a09e93892a10
Reviewed-on: https://skia-review.googlesource.com/39961
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
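
The guard in question (visible in the SkJumper_stages_8bit.cpp hunk below) wraps the 8..15-element tail cases in #if defined(__AVX2__), so builds whose vectors are only 8 lanes wide never see a memcpy larger than the destination. A minimal sketch of the pattern follows; load_tail, V, and T here are simplified stand-ins for illustration, not the shipped code:

    // Sketch only: the real code is templated over Skia's V and T and lives in
    // SkJumper_stages_8bit.cpp; this stand-alone version just shows the guard.
    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    #if defined(__AVX2__)
        using V = uint8_t __attribute__((ext_vector_type(16)));  // 16 lanes per pass
    #else
        using V = uint8_t __attribute__((ext_vector_type(8)));   // 8 lanes per pass
    #endif
    using T = uint8_t;

    static V load_tail(const T* src, size_t tail) {
        V v = 0;
        switch (tail) {
        #if defined(__AVX2__)   // Cases 8..15 exist only when V really has 16 lanes,
                                // so Clang never sees an over-sized memcpy here.
            case 15: v[14] = src[14];
            case 14: v[13] = src[13];
            case 13: v[12] = src[12];
            case 12: memcpy(&v, src, 12*sizeof(T)); break;  // the statement the error flagged
            case 11: v[10] = src[10];
            case 10: v[ 9] = src[ 9];
            case  9: v[ 8] = src[ 8];
            case  8: memcpy(&v, src,  8*sizeof(T)); break;
        #endif
            case  7: v[6] = src[6];
            case  6: v[5] = src[5];
            case  5: v[4] = src[4];
            case  4: memcpy(&v, src,  4*sizeof(T)); break;
            case  3: v[2] = src[2];
            case  2: memcpy(&v, src,  2*sizeof(T)); break;
            case  1: v[0] = src[0];
        }
        return v;
    }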
Diffstat (limited to 'src/jumper')
-rw-r--r-- | src/jumper/SkJumper.cpp             | 42
-rw-r--r-- | src/jumper/SkJumper_generated.S     | 44
-rw-r--r-- | src/jumper/SkJumper_generated_win.S | 66
-rw-r--r-- | src/jumper/SkJumper_stages_8bit.cpp | 41
4 files changed, 118 insertions, 75 deletions
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 9f8e970f32..315110faf2 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -110,7 +110,7 @@ using StartPipelineFn = void(size_t,size_t,size_t,size_t, void**,K*);
 extern "C" {
 #if __has_feature(memory_sanitizer)
-    // We'll just run portable code.
+    // We'll just run baseline code.
 #elif defined(__arm__)
     StartPipelineFn ASM(start_pipeline,vfp4);
@@ -168,12 +168,22 @@ extern "C" {
 #endif
-    // Portable, single-pixel stages.
+    // Baseline code compiled as a normal part of Skia.
     StartPipelineFn sk_start_pipeline;
     StageFn sk_just_return;
     #define M(st) StageFn sk_##st;
         SK_RASTER_PIPELINE_STAGES(M)
     #undef M
+
+#if defined(__clang__) && defined(__aarch64__)
+    // We also compile 8-bit stages on ARMv8 as a normal part of Skia when compiled with Clang.
+    StartPipelineFn sk_start_pipeline_8bit;
+    StageFn sk_just_return_8bit;
+    #define M(st) StageFn sk_##st##_8bit;
+        SK_RASTER_PIPELINE_STAGES(M)
+    #undef M
+#endif
+
 }
 #if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
@@ -198,6 +208,16 @@ extern "C" {
     }
     LOWP_STAGES(M)
     #undef M
+#elif defined(__clang__) && defined(__aarch64__)
+    template <SkRasterPipeline::StockStage st>
+    static constexpr StageFn* aarch64_8bit() { return nullptr; }
+
+    #define M(st) \
+        template <> constexpr StageFn* aarch64_8bit<SkRasterPipeline::st>() { \
+            return sk_##st##_8bit; \
+        }
+    LOWP_STAGES(M)
+    #undef M
 #endif
 // Engines comprise everything we need to run SkRasterPipelines.
@@ -207,20 +227,20 @@ struct SkJumper_Engine {
     StageFn* just_return;
 };
-// We'll default to this portable engine, but try to choose a better one at runtime.
-static const SkJumper_Engine kPortable = {
+// We'll default to this baseline engine, but try to choose a better one at runtime.
+static const SkJumper_Engine kBaseline = {
 #define M(stage) sk_##stage,
     { SK_RASTER_PIPELINE_STAGES(M) },
 #undef M
     sk_start_pipeline,
     sk_just_return,
 };
-static SkJumper_Engine gEngine = kPortable;
+static SkJumper_Engine gEngine = kBaseline;
 static SkOnce gChooseEngineOnce;
 static SkJumper_Engine choose_engine() {
 #if __has_feature(memory_sanitizer)
-    // We'll just run portable code.
+    // We'll just run baseline code.
#elif defined(__arm__) if (1 && SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) { @@ -283,7 +303,7 @@ static SkJumper_Engine choose_engine() { } #endif - return kPortable; + return kBaseline; } #ifndef SK_JUMPER_DISABLE_8BIT @@ -326,6 +346,14 @@ static SkJumper_Engine choose_engine() { #undef M }; } + #elif defined(__clang__) && defined(__aarch64__) + return { + #define M(st) aarch64_8bit<SkRasterPipeline::st>(), + { SK_RASTER_PIPELINE_STAGES(M) }, + sk_start_pipeline_8bit, + sk_just_return_8bit, + #undef M + }; #endif return kNone; } diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S index eeb3a88d77..465095b67f 100644 --- a/src/jumper/SkJumper_generated.S +++ b/src/jumper/SkJumper_generated.S @@ -50207,9 +50207,9 @@ _sk_load_a8_sse2_8bit: .byte 117,48 // jne 28f81 <_sk_load_a8_sse2_8bit+0x4d> .byte 243,66,15,126,4,2 // movq (%rdx,%r8,1),%xmm0 .byte 102,15,96,192 // punpcklbw %xmm0,%xmm0 - .byte 102,15,84,5,109,51,0,0 // andpd 0x336d(%rip),%xmm0 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> + .byte 102,15,219,5,109,51,0,0 // pand 0x336d(%rip),%xmm0 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> .byte 102,15,239,228 // pxor %xmm4,%xmm4 - .byte 102,15,40,200 // movapd %xmm0,%xmm1 + .byte 102,15,111,200 // movdqa %xmm0,%xmm1 .byte 102,15,105,204 // punpckhwd %xmm4,%xmm1 .byte 102,15,97,196 // punpcklwd %xmm4,%xmm0 .byte 102,15,114,240,24 // pslld $0x18,%xmm0 @@ -50284,9 +50284,9 @@ _sk_load_a8_dst_sse2_8bit: .byte 117,48 // jne 29075 <_sk_load_a8_dst_sse2_8bit+0x4d> .byte 243,66,15,126,20,2 // movq (%rdx,%r8,1),%xmm2 .byte 102,15,96,208 // punpcklbw %xmm0,%xmm2 - .byte 102,15,84,21,121,50,0,0 // andpd 0x3279(%rip),%xmm2 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> + .byte 102,15,219,21,121,50,0,0 // pand 0x3279(%rip),%xmm2 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> .byte 102,15,239,228 // pxor %xmm4,%xmm4 - .byte 102,15,40,218 // movapd %xmm2,%xmm3 + .byte 102,15,111,218 // movdqa %xmm2,%xmm3 .byte 102,15,105,220 // punpckhwd %xmm4,%xmm3 .byte 102,15,97,212 // punpcklwd %xmm4,%xmm2 .byte 102,15,114,242,24 // pslld $0x18,%xmm2 @@ -50382,26 +50382,26 @@ _sk_store_a8_sse2_8bit: .byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax .byte 72,1,200 // add %rcx,%rax .byte 255,224 // jmpq *%rax - .byte 102,15,127,100,36,168 // movdqa %xmm4,-0x58(%rsp) - .byte 138,68,36,168 // mov -0x58(%rsp),%al + .byte 102,15,127,100,36,232 // movdqa %xmm4,-0x18(%rsp) + .byte 138,68,36,232 // mov -0x18(%rsp),%al .byte 66,136,4,2 // mov %al,(%rdx,%r8,1) .byte 235,203 // jmp 29175 <_sk_store_a8_sse2_8bit+0x59> - .byte 102,15,127,100,36,184 // movdqa %xmm4,-0x48(%rsp) - .byte 138,68,36,188 // mov -0x44(%rsp),%al + .byte 102,15,127,100,36,216 // movdqa %xmm4,-0x28(%rsp) + .byte 138,68,36,220 // mov -0x24(%rsp),%al .byte 66,136,68,2,2 // mov %al,0x2(%rdx,%r8,1) .byte 102,15,219,37,15,49,0,0 // pand 0x310f(%rip),%xmm4 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> .byte 102,15,103,228 // packuswb %xmm4,%xmm4 .byte 102,15,126,224 // movd %xmm4,%eax .byte 102,66,137,4,2 // mov %ax,(%rdx,%r8,1) .byte 235,165 // jmp 29175 <_sk_store_a8_sse2_8bit+0x59> - .byte 102,15,127,100,36,232 // movdqa %xmm4,-0x18(%rsp) - .byte 138,68,36,244 // mov -0xc(%rsp),%al + .byte 102,15,127,100,36,200 // movdqa %xmm4,-0x38(%rsp) + .byte 138,68,36,212 // mov -0x2c(%rsp),%al .byte 66,136,68,2,6 // mov %al,0x6(%rdx,%r8,1) - .byte 102,15,127,100,36,216 // movdqa %xmm4,-0x28(%rsp) - .byte 138,68,36,226 // mov -0x1e(%rsp),%al + .byte 102,15,127,100,36,184 // movdqa %xmm4,-0x48(%rsp) + .byte 138,68,36,194 // mov -0x3e(%rsp),%al .byte 66,136,68,2,5 // mov 
%al,0x5(%rdx,%r8,1) - .byte 102,15,127,100,36,200 // movdqa %xmm4,-0x38(%rsp) - .byte 138,68,36,208 // mov -0x30(%rsp),%al + .byte 102,15,127,100,36,168 // movdqa %xmm4,-0x58(%rsp) + .byte 138,68,36,176 // mov -0x50(%rsp),%al .byte 66,136,68,2,4 // mov %al,0x4(%rdx,%r8,1) .byte 102,15,219,37,203,48,0,0 // pand 0x30cb(%rip),%xmm4 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> .byte 102,15,103,228 // packuswb %xmm4,%xmm4 @@ -50440,9 +50440,9 @@ _sk_load_g8_sse2_8bit: .byte 117,116 // jne 292c1 <_sk_load_g8_sse2_8bit+0x91> .byte 243,66,15,126,4,2 // movq (%rdx,%r8,1),%xmm0 .byte 102,15,96,192 // punpcklbw %xmm0,%xmm0 - .byte 102,15,84,5,113,48,0,0 // andpd 0x3071(%rip),%xmm0 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> + .byte 102,15,219,5,113,48,0,0 // pand 0x3071(%rip),%xmm0 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> .byte 102,15,239,201 // pxor %xmm1,%xmm1 - .byte 102,15,40,224 // movapd %xmm0,%xmm4 + .byte 102,15,111,224 // movdqa %xmm0,%xmm4 .byte 102,15,97,225 // punpcklwd %xmm1,%xmm4 .byte 102,15,105,193 // punpckhwd %xmm1,%xmm0 .byte 102,15,111,45,169,55,0,0 // movdqa 0x37a9(%rip),%xmm5 # 2ca20 <_sk_overlay_sse2_8bit+0x153b> @@ -50532,9 +50532,9 @@ _sk_load_g8_dst_sse2_8bit: .byte 117,116 // jne 29401 <_sk_load_g8_dst_sse2_8bit+0x91> .byte 243,66,15,126,20,2 // movq (%rdx,%r8,1),%xmm2 .byte 102,15,96,208 // punpcklbw %xmm0,%xmm2 - .byte 102,15,84,21,49,47,0,0 // andpd 0x2f31(%rip),%xmm2 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> + .byte 102,15,219,21,49,47,0,0 // pand 0x2f31(%rip),%xmm2 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> .byte 102,15,239,219 // pxor %xmm3,%xmm3 - .byte 102,15,40,226 // movapd %xmm2,%xmm4 + .byte 102,15,111,226 // movdqa %xmm2,%xmm4 .byte 102,15,97,227 // punpcklwd %xmm3,%xmm4 .byte 102,15,105,211 // punpckhwd %xmm3,%xmm2 .byte 102,15,111,45,105,54,0,0 // movdqa 0x3669(%rip),%xmm5 # 2ca20 <_sk_overlay_sse2_8bit+0x153b> @@ -50815,9 +50815,9 @@ _sk_scale_u8_sse2_8bit: .byte 15,133,239,0,0,0 // jne 298ad <_sk_scale_u8_sse2_8bit+0x110> .byte 243,66,15,126,36,2 // movq (%rdx,%r8,1),%xmm4 .byte 102,15,96,224 // punpcklbw %xmm0,%xmm4 - .byte 102,15,84,37,0,43,0,0 // andpd 0x2b00(%rip),%xmm4 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> + .byte 102,15,219,37,0,43,0,0 // pand 0x2b00(%rip),%xmm4 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> .byte 102,69,15,239,192 // pxor %xmm8,%xmm8 - .byte 102,15,40,236 // movapd %xmm4,%xmm5 + .byte 102,15,111,236 // movdqa %xmm4,%xmm5 .byte 102,65,15,105,232 // punpckhwd %xmm8,%xmm5 .byte 102,65,15,97,224 // punpcklwd %xmm8,%xmm4 .byte 102,15,114,244,24 // pslld $0x18,%xmm4 @@ -51005,9 +51005,9 @@ _sk_lerp_u8_sse2_8bit: .byte 15,133,141,1,0,0 // jne 29c44 <_sk_lerp_u8_sse2_8bit+0x1ae> .byte 243,66,15,126,44,2 // movq (%rdx,%r8,1),%xmm5 .byte 102,15,96,232 // punpcklbw %xmm0,%xmm5 - .byte 102,15,84,45,7,40,0,0 // andpd 0x2807(%rip),%xmm5 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> + .byte 102,15,219,45,7,40,0,0 // pand 0x2807(%rip),%xmm5 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb> .byte 102,69,15,239,192 // pxor %xmm8,%xmm8 - .byte 102,15,40,229 // movapd %xmm5,%xmm4 + .byte 102,15,111,229 // movdqa %xmm5,%xmm4 .byte 102,65,15,105,224 // punpckhwd %xmm8,%xmm4 .byte 102,65,15,97,232 // punpcklwd %xmm8,%xmm5 .byte 102,15,114,245,24 // pslld $0x18,%xmm5 diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S index 99ec6b9fa9..d85a0de655 100644 --- a/src/jumper/SkJumper_generated_win.S +++ b/src/jumper/SkJumper_generated_win.S @@ -39685,9 +39685,9 @@ _sk_load_a8_sse2_8bit LABEL PROC DB 117,48 ; jne 296ad <_sk_load_a8_sse2_8bit+0x4d> DB 243,66,15,126,4,2 ; movq 
(%rdx,%r8,1),%xmm0 DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0 - DB 102,15,84,5,193,51,0,0 ; andpd 0x33c1(%rip),%xmm0 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> + DB 102,15,219,5,193,51,0,0 ; pand 0x33c1(%rip),%xmm0 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> DB 102,15,239,228 ; pxor %xmm4,%xmm4 - DB 102,15,40,200 ; movapd %xmm0,%xmm1 + DB 102,15,111,200 ; movdqa %xmm0,%xmm1 DB 102,15,105,204 ; punpckhwd %xmm4,%xmm1 DB 102,15,97,196 ; punpcklwd %xmm4,%xmm0 DB 102,15,114,240,24 ; pslld $0x18,%xmm0 @@ -39760,9 +39760,9 @@ _sk_load_a8_dst_sse2_8bit LABEL PROC DB 117,48 ; jne 297a1 <_sk_load_a8_dst_sse2_8bit+0x4d> DB 243,66,15,126,20,2 ; movq (%rdx,%r8,1),%xmm2 DB 102,15,96,208 ; punpcklbw %xmm0,%xmm2 - DB 102,15,84,21,205,50,0,0 ; andpd 0x32cd(%rip),%xmm2 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> + DB 102,15,219,21,205,50,0,0 ; pand 0x32cd(%rip),%xmm2 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> DB 102,15,239,228 ; pxor %xmm4,%xmm4 - DB 102,15,40,218 ; movapd %xmm2,%xmm3 + DB 102,15,111,218 ; movdqa %xmm2,%xmm3 DB 102,15,105,220 ; punpckhwd %xmm4,%xmm3 DB 102,15,97,212 ; punpcklwd %xmm4,%xmm2 DB 102,15,114,242,24 ; pslld $0x18,%xmm2 @@ -39858,48 +39858,46 @@ _sk_store_a8_sse2_8bit LABEL PROC DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax DB 72,1,200 ; add %rcx,%rax DB 255,224 ; jmpq *%rax - DB 102,15,127,36,36 ; movdqa %xmm4,(%rsp) - DB 138,4,36 ; mov (%rsp),%al + DB 102,15,127,100,36,64 ; movdqa %xmm4,0x40(%rsp) + DB 138,68,36,64 ; mov 0x40(%rsp),%al DB 66,136,4,2 ; mov %al,(%rdx,%r8,1) - DB 235,201 ; jmp 298a5 <_sk_store_a8_sse2_8bit+0x5d> - DB 102,15,127,100,36,16 ; movdqa %xmm4,0x10(%rsp) - DB 138,68,36,20 ; mov 0x14(%rsp),%al + DB 235,199 ; jmp 298a5 <_sk_store_a8_sse2_8bit+0x5d> + DB 102,15,127,100,36,48 ; movdqa %xmm4,0x30(%rsp) + DB 138,68,36,52 ; mov 0x34(%rsp),%al DB 66,136,68,2,2 ; mov %al,0x2(%rdx,%r8,1) - DB 102,15,219,37,93,49,0,0 ; pand 0x315d(%rip),%xmm4 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> + DB 102,15,219,37,91,49,0,0 ; pand 0x315b(%rip),%xmm4 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> DB 102,15,103,228 ; packuswb %xmm4,%xmm4 DB 102,15,126,224 ; movd %xmm4,%eax DB 102,66,137,4,2 ; mov %ax,(%rdx,%r8,1) - DB 235,163 ; jmp 298a5 <_sk_store_a8_sse2_8bit+0x5d> - DB 102,15,127,100,36,64 ; movdqa %xmm4,0x40(%rsp) - DB 138,68,36,76 ; mov 0x4c(%rsp),%al + DB 235,161 ; jmp 298a5 <_sk_store_a8_sse2_8bit+0x5d> + DB 102,15,127,100,36,32 ; movdqa %xmm4,0x20(%rsp) + DB 138,68,36,44 ; mov 0x2c(%rsp),%al DB 66,136,68,2,6 ; mov %al,0x6(%rdx,%r8,1) - DB 102,15,127,100,36,48 ; movdqa %xmm4,0x30(%rsp) - DB 138,68,36,58 ; mov 0x3a(%rsp),%al + DB 102,15,127,100,36,16 ; movdqa %xmm4,0x10(%rsp) + DB 138,68,36,26 ; mov 0x1a(%rsp),%al DB 66,136,68,2,5 ; mov %al,0x5(%rdx,%r8,1) - DB 102,15,127,100,36,32 ; movdqa %xmm4,0x20(%rsp) - DB 138,68,36,40 ; mov 0x28(%rsp),%al + DB 102,15,127,36,36 ; movdqa %xmm4,(%rsp) + DB 138,68,36,8 ; mov 0x8(%rsp),%al DB 66,136,68,2,4 ; mov %al,0x4(%rdx,%r8,1) - DB 102,15,219,37,25,49,0,0 ; pand 0x3119(%rip),%xmm4 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> + DB 102,15,219,37,24,49,0,0 ; pand 0x3118(%rip),%xmm4 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> DB 102,15,103,228 ; packuswb %xmm4,%xmm4 DB 102,66,15,126,36,2 ; movd %xmm4,(%rdx,%r8,1) - DB 233,95,255,255,255 ; jmpq 298a5 <_sk_store_a8_sse2_8bit+0x5d> - DB 102,144 ; xchg %ax,%ax + DB 233,94,255,255,255 ; jmpq 298a5 <_sk_store_a8_sse2_8bit+0x5d> + DB 144 ; nop DB 134,255 ; xchg %bh,%bh DB 255 ; (bad) - DB 255,163,255,255,255,148 ; jmpq *-0x6b000001(%rbx) + DB 255,165,255,255,255,150 ; jmpq *-0x69000001(%rbp) DB 255 ; (bad) DB 255 ; (bad) - DB 255,231 ; jmpq *%rdi 
- DB 255 ; (bad) DB 255 ; (bad) + DB 232,255,255,255,218 ; callq ffffffffdb029958 <_sk_overlay_sse2_8bit+0xffffffffdaffdd0b> DB 255 ; (bad) - DB 216,255 ; fdivr %st(7),%st DB 255 ; (bad) - DB 255,201 ; dec %ecx + DB 255,203 ; dec %ebx DB 255 ; (bad) DB 255 ; (bad) DB 255 ; (bad) - DB 186 ; .byte 0xba + DB 188 ; .byte 0xbc DB 255 ; (bad) DB 255 ; (bad) DB 255 ; .byte 0xff @@ -39917,9 +39915,9 @@ _sk_load_g8_sse2_8bit LABEL PROC DB 117,116 ; jne 299f5 <_sk_load_g8_sse2_8bit+0x91> DB 243,66,15,126,4,2 ; movq (%rdx,%r8,1),%xmm0 DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0 - DB 102,15,84,5,189,48,0,0 ; andpd 0x30bd(%rip),%xmm0 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> + DB 102,15,219,5,189,48,0,0 ; pand 0x30bd(%rip),%xmm0 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> DB 102,15,239,201 ; pxor %xmm1,%xmm1 - DB 102,15,40,224 ; movapd %xmm0,%xmm4 + DB 102,15,111,224 ; movdqa %xmm0,%xmm4 DB 102,15,97,225 ; punpcklwd %xmm1,%xmm4 DB 102,15,105,193 ; punpckhwd %xmm1,%xmm0 DB 102,15,111,45,245,55,0,0 ; movdqa 0x37f5(%rip),%xmm5 # 2d1a0 <_sk_overlay_sse2_8bit+0x1553> @@ -40007,9 +40005,9 @@ _sk_load_g8_dst_sse2_8bit LABEL PROC DB 117,116 ; jne 29b35 <_sk_load_g8_dst_sse2_8bit+0x91> DB 243,66,15,126,20,2 ; movq (%rdx,%r8,1),%xmm2 DB 102,15,96,208 ; punpcklbw %xmm0,%xmm2 - DB 102,15,84,21,125,47,0,0 ; andpd 0x2f7d(%rip),%xmm2 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> + DB 102,15,219,21,125,47,0,0 ; pand 0x2f7d(%rip),%xmm2 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> DB 102,15,239,219 ; pxor %xmm3,%xmm3 - DB 102,15,40,226 ; movapd %xmm2,%xmm4 + DB 102,15,111,226 ; movdqa %xmm2,%xmm4 DB 102,15,97,227 ; punpcklwd %xmm3,%xmm4 DB 102,15,105,211 ; punpckhwd %xmm3,%xmm2 DB 102,15,111,45,181,54,0,0 ; movdqa 0x36b5(%rip),%xmm5 # 2d1a0 <_sk_overlay_sse2_8bit+0x1553> @@ -40284,9 +40282,9 @@ _sk_scale_u8_sse2_8bit LABEL PROC DB 15,133,239,0,0,0 ; jne 29fe1 <_sk_scale_u8_sse2_8bit+0x110> DB 243,66,15,126,36,2 ; movq (%rdx,%r8,1),%xmm4 DB 102,15,96,224 ; punpcklbw %xmm0,%xmm4 - DB 102,15,84,37,76,43,0,0 ; andpd 0x2b4c(%rip),%xmm4 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> + DB 102,15,219,37,76,43,0,0 ; pand 0x2b4c(%rip),%xmm4 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> DB 102,69,15,239,192 ; pxor %xmm8,%xmm8 - DB 102,15,40,236 ; movapd %xmm4,%xmm5 + DB 102,15,111,236 ; movdqa %xmm4,%xmm5 DB 102,65,15,105,232 ; punpckhwd %xmm8,%xmm5 DB 102,65,15,97,224 ; punpcklwd %xmm8,%xmm4 DB 102,15,114,244,24 ; pslld $0x18,%xmm4 @@ -40470,9 +40468,9 @@ _sk_lerp_u8_sse2_8bit LABEL PROC DB 15,133,141,1,0,0 ; jne 2a378 <_sk_lerp_u8_sse2_8bit+0x1ae> DB 243,66,15,126,44,2 ; movq (%rdx,%r8,1),%xmm5 DB 102,15,96,232 ; punpcklbw %xmm0,%xmm5 - DB 102,15,84,45,83,40,0,0 ; andpd 0x2853(%rip),%xmm5 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> + DB 102,15,219,45,83,40,0,0 ; pand 0x2853(%rip),%xmm5 # 2ca50 <_sk_overlay_sse2_8bit+0xe03> DB 102,69,15,239,192 ; pxor %xmm8,%xmm8 - DB 102,15,40,229 ; movapd %xmm5,%xmm4 + DB 102,15,111,229 ; movdqa %xmm5,%xmm4 DB 102,65,15,105,224 ; punpckhwd %xmm8,%xmm4 DB 102,65,15,97,232 ; punpcklwd %xmm8,%xmm5 DB 102,15,114,245,24 ; pslld $0x18,%xmm5 diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp index 5c73ea8cbe..0c019f8fbc 100644 --- a/src/jumper/SkJumper_stages_8bit.cpp +++ b/src/jumper/SkJumper_stages_8bit.cpp @@ -5,23 +5,27 @@ * found in the LICENSE file. */ +// This restricted SkJumper backend works on 8-bit per channel interlaced +// pixels. This is the natural format for kN32_SkColorType buffers, and we +// hope the stages in this file can replace many custom legacy routines. 
+ #include "SkJumper.h" #include "SkJumper_misc.h" -#if defined(__SSE2__) +// As an experiment we bake ARMv8 8-bit code in as normally compiled Skia code. +// Any other platform (so far) is offline-only. +#if defined(JUMPER_IS_OFFLINE) || (defined(__clang__) && defined(__aarch64__)) + +#if defined(__aarch64__) + #include <arm_neon.h> +#else #include <immintrin.h> #endif -// This restricted SkJumper backend works on 8-bit per channel interlaced -// pixels. This is the natural format for kN32_SkColorType buffers, and we -// hope the stages in this file can replace many custom legacy routines. - #if !defined(JUMPER_IS_OFFLINE) - #error "This file must be pre-compiled." + #define WRAP(name) sk_##name##_8bit #elif defined(__aarch64__) #define WRAP(name) sk_##name##_aarch64_8bit -#elif defined(__arm__) - #define WRAP(name) sk_##name##_vfp4_8bit #elif defined(__AVX2__) #define WRAP(name) sk_##name##_hsw_8bit #elif defined(__SSE4_1__) @@ -112,7 +116,7 @@ SI V operator*(V x, V y) { template <typename T> SI T inv(T v) { return 0xff - v; } -SI V two(V v) { return v + v; } + SI V lerp(V from, V to, V t) { return to*t + from*inv(t); } SI V alpha(V v) { @@ -162,10 +166,13 @@ SI V saturated_add(V a, V b) { b_lo, b_hi; split(a.u8x4, &a_lo, &a_hi); split(b.u8x4, &b_lo, &b_hi); -#if defined(__AVX2__) +#if defined(__aarch64__) + return join(vqaddq_u8(a_lo, b_lo), + vqaddq_u8(a_hi, b_hi)); +#elif defined(__AVX2__) return join(_mm256_adds_epu8(a_lo, b_lo), _mm256_adds_epu8(a_hi, b_hi)); -#else +#elif defined(__SSE2__) return join(_mm_adds_epu8(a_lo, b_lo), _mm_adds_epu8(a_hi, b_hi)); #endif @@ -185,7 +192,11 @@ using Stage = void(const Params* params, void** program, R src_lo, R src_hi, R d MAYBE_MSABI extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t ylimit, void** program, const SkJumper_constants*) { - R r; +#if defined(JUMPER_IS_OFFLINE) + R r; // Fastest to start uninitialized. +#else + R r{}; // Next best is zero'd for compilers that will complain about uninitialized values. +#endif auto start = (Stage*)load_and_inc(program); for (; y < ylimit; y++) { Params params = { x,y,0 }; @@ -223,6 +234,7 @@ SI V load(const T* src, size_t tail) { if (__builtin_expect(tail, 0)) { V v = 0; switch (tail) { + #if defined(__AVX2__) case 15: v[14] = src[14]; case 14: v[13] = src[13]; case 13: v[12] = src[12]; @@ -231,6 +243,7 @@ SI V load(const T* src, size_t tail) { case 10: v[ 9] = src[ 9]; case 9: v[ 8] = src[ 8]; case 8: memcpy(&v, src, 8*sizeof(T)); break; + #endif case 7: v[6] = src[6]; case 6: v[5] = src[5]; case 5: v[4] = src[4]; @@ -249,6 +262,7 @@ SI void store(T* dst, V v, size_t tail) { __builtin_assume(tail < kStride); if (__builtin_expect(tail, 0)) { switch (tail) { + #if defined(__AVX2__) case 15: dst[14] = v[14]; case 14: dst[13] = v[13]; case 13: dst[12] = v[12]; @@ -257,6 +271,7 @@ SI void store(T* dst, V v, size_t tail) { case 10: dst[ 9] = v[ 9]; case 9: dst[ 8] = v[ 8]; case 8: memcpy(dst, &v, 8*sizeof(T)); break; + #endif case 7: dst[6] = v[6]; case 6: dst[5] = v[5]; case 5: dst[4] = v[4]; @@ -461,3 +476,5 @@ STAGE(overlay) { // colorburn | // colordodge > these involve division, which makes them (much) slower than the float stages. // softlight | + +#endif |