author    Mike Klein <mtklein@chromium.org>  2017-08-29 08:40:48 -0400
committer Mike Klein <mtklein@chromium.org>  2017-08-29 17:04:47 +0000
commit    9d7e57d509149dd2fcb3ba73ea8f4cdce11f84bd (patch)
tree      5442beb60c037b62ebc9477742d6490fb6dcac20 /src/jumper
parent    6d13575108299951ecdfba6d85c915fcec2bc028 (diff)
Revert "Revert "8-bit jumper on armv8""
This reverts commit 6d13575108299951ecdfba6d85c915fcec2bc028.

Now with guards for "errors" like this:

    external/skia/src/jumper/SkJumper_stages_8bit.cpp:240:50: error: 'memcpy' called with size bigger than buffer
        case 12: memcpy(&v, src, 12*sizeof(T)); break;

This code is unreachable and generally removed by Clang's optimizer anyway...
as far as I can tell the code generation diff is arbitrary.

Change-Id: I6216567caaa6166f71258bd25343a09e93892a10
Reviewed-on: https://skia-review.googlesource.com/39961
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
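The guard referred to above is the `#if defined(__AVX2__)` wrapped around the wide tail cases in SkJumper_stages_8bit.cpp (last hunk below): when the vector type has only 8 lanes, the case 8..15 labels can never be taken, but the compiler still diagnoses a memcpy whose size exceeds the destination, so those cases are now compiled only on 16-lane builds. A minimal standalone sketch of that pattern, using a plain struct as a stand-in for the real vector type (not the actual Skia helpers):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Stand-in for the jumper's 8-lane byte vector on SSE2/NEON builds.
    struct V8 { uint8_t lane[8]; };

    // Load the first 'tail' bytes (tail < 8 on 8-lane builds).  The wide cases
    // are compiled only when 16-lane (AVX2) vectors are in play, so the compiler
    // never sees a memcpy larger than the destination.
    static V8 load_tail(const uint8_t* src, size_t tail) {
        V8 v = {};
        switch (tail) {   // intentional fallthrough, matching the Skia code
        #if defined(__AVX2__)
            // cases 15 down to 8 would live here on 16-lane builds
        #endif
            case 7: v.lane[6] = src[6];
            case 6: v.lane[5] = src[5];
            case 5: v.lane[4] = src[4];
            case 4: std::memcpy(v.lane, src, 4); break;
            case 3: v.lane[2] = src[2];
            case 2: std::memcpy(v.lane, src, 2); break;
            case 1: v.lane[0] = src[0];
        }
        return v;
    }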
Diffstat (limited to 'src/jumper')
-rw-r--r--  src/jumper/SkJumper.cpp              42
-rw-r--r--  src/jumper/SkJumper_generated.S      44
-rw-r--r--  src/jumper/SkJumper_generated_win.S  66
-rw-r--r--  src/jumper/SkJumper_stages_8bit.cpp  41
4 files changed, 118 insertions, 75 deletions
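The SkJumper.cpp hunks below wire the newly in-tree ARMv8 8-bit stages into engine selection. The lookup uses a function template that returns nullptr by default and is specialized for each stage that has an 8-bit version; the engine table is then built from those pointers. A reduced sketch of that pattern, with simplified stand-in names and a simplified StageFn signature (the real table is generated by the SK_RASTER_PIPELINE_STAGES / LOWP_STAGES macros, and the nullptr fallback check lives outside this diff):

    // Simplified stand-in signature; the real StageFn takes pipeline arguments.
    using StageFn = void();

    // Stub standing in for one of the compiled 8-bit stages.
    extern "C" void sk_srcover_8bit() {}

    enum class Stage { srcover, colorburn };

    // Default: this stage has no 8-bit implementation.
    template <Stage st>
    static constexpr StageFn* aarch64_8bit() { return nullptr; }

    // Stages that do have one specialize the template.
    template <>
    constexpr StageFn* aarch64_8bit<Stage::srcover>() { return sk_srcover_8bit; }

    // Engine table: uncovered stages stay nullptr, which the pipeline builder
    // can use to fall back to the float engine for those stages.
    static StageFn* const kStages8bit[] = {
        aarch64_8bit<Stage::srcover>(),
        aarch64_8bit<Stage::colorburn>(),   // nullptr
    };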
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 9f8e970f32..315110faf2 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -110,7 +110,7 @@ using StartPipelineFn = void(size_t,size_t,size_t,size_t, void**,K*);
extern "C" {
#if __has_feature(memory_sanitizer)
- // We'll just run portable code.
+ // We'll just run baseline code.
#elif defined(__arm__)
StartPipelineFn ASM(start_pipeline,vfp4);
@@ -168,12 +168,22 @@ extern "C" {
#endif
- // Portable, single-pixel stages.
+ // Baseline code compiled as a normal part of Skia.
StartPipelineFn sk_start_pipeline;
StageFn sk_just_return;
#define M(st) StageFn sk_##st;
SK_RASTER_PIPELINE_STAGES(M)
#undef M
+
+#if defined(__clang__) && defined(__aarch64__)
+ // We also compile 8-bit stages on ARMv8 as a normal part of Skia when compiled with Clang.
+ StartPipelineFn sk_start_pipeline_8bit;
+ StageFn sk_just_return_8bit;
+ #define M(st) StageFn sk_##st##_8bit;
+ SK_RASTER_PIPELINE_STAGES(M)
+ #undef M
+#endif
+
}
#if !__has_feature(memory_sanitizer) && (defined(__x86_64__) || defined(_M_X64))
@@ -198,6 +208,16 @@ extern "C" {
}
LOWP_STAGES(M)
#undef M
+#elif defined(__clang__) && defined(__aarch64__)
+ template <SkRasterPipeline::StockStage st>
+ static constexpr StageFn* aarch64_8bit() { return nullptr; }
+
+ #define M(st) \
+ template <> constexpr StageFn* aarch64_8bit<SkRasterPipeline::st>() { \
+ return sk_##st##_8bit; \
+ }
+ LOWP_STAGES(M)
+ #undef M
#endif
// Engines comprise everything we need to run SkRasterPipelines.
@@ -207,20 +227,20 @@ struct SkJumper_Engine {
StageFn* just_return;
};
-// We'll default to this portable engine, but try to choose a better one at runtime.
-static const SkJumper_Engine kPortable = {
+// We'll default to this baseline engine, but try to choose a better one at runtime.
+static const SkJumper_Engine kBaseline = {
#define M(stage) sk_##stage,
{ SK_RASTER_PIPELINE_STAGES(M) },
#undef M
sk_start_pipeline,
sk_just_return,
};
-static SkJumper_Engine gEngine = kPortable;
+static SkJumper_Engine gEngine = kBaseline;
static SkOnce gChooseEngineOnce;
static SkJumper_Engine choose_engine() {
#if __has_feature(memory_sanitizer)
- // We'll just run portable code.
+ // We'll just run baseline code.
#elif defined(__arm__)
if (1 && SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
@@ -283,7 +303,7 @@ static SkJumper_Engine choose_engine() {
}
#endif
- return kPortable;
+ return kBaseline;
}
#ifndef SK_JUMPER_DISABLE_8BIT
@@ -326,6 +346,14 @@ static SkJumper_Engine choose_engine() {
#undef M
};
}
+ #elif defined(__clang__) && defined(__aarch64__)
+ return {
+ #define M(st) aarch64_8bit<SkRasterPipeline::st>(),
+ { SK_RASTER_PIPELINE_STAGES(M) },
+ sk_start_pipeline_8bit,
+ sk_just_return_8bit,
+ #undef M
+ };
#endif
return kNone;
}
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index eeb3a88d77..465095b67f 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -50207,9 +50207,9 @@ _sk_load_a8_sse2_8bit:
.byte 117,48 // jne 28f81 <_sk_load_a8_sse2_8bit+0x4d>
.byte 243,66,15,126,4,2 // movq (%rdx,%r8,1),%xmm0
.byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
- .byte 102,15,84,5,109,51,0,0 // andpd 0x336d(%rip),%xmm0 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb>
+ .byte 102,15,219,5,109,51,0,0 // pand 0x336d(%rip),%xmm0 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb>
.byte 102,15,239,228 // pxor %xmm4,%xmm4
- .byte 102,15,40,200 // movapd %xmm0,%xmm1
+ .byte 102,15,111,200 // movdqa %xmm0,%xmm1
.byte 102,15,105,204 // punpckhwd %xmm4,%xmm1
.byte 102,15,97,196 // punpcklwd %xmm4,%xmm0
.byte 102,15,114,240,24 // pslld $0x18,%xmm0
@@ -50284,9 +50284,9 @@ _sk_load_a8_dst_sse2_8bit:
.byte 117,48 // jne 29075 <_sk_load_a8_dst_sse2_8bit+0x4d>
.byte 243,66,15,126,20,2 // movq (%rdx,%r8,1),%xmm2
.byte 102,15,96,208 // punpcklbw %xmm0,%xmm2
- .byte 102,15,84,21,121,50,0,0 // andpd 0x3279(%rip),%xmm2 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb>
+ .byte 102,15,219,21,121,50,0,0 // pand 0x3279(%rip),%xmm2 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb>
.byte 102,15,239,228 // pxor %xmm4,%xmm4
- .byte 102,15,40,218 // movapd %xmm2,%xmm3
+ .byte 102,15,111,218 // movdqa %xmm2,%xmm3
.byte 102,15,105,220 // punpckhwd %xmm4,%xmm3
.byte 102,15,97,212 // punpcklwd %xmm4,%xmm2
.byte 102,15,114,242,24 // pslld $0x18,%xmm2
@@ -50382,26 +50382,26 @@ _sk_store_a8_sse2_8bit:
.byte 72,99,4,129 // movslq (%rcx,%rax,4),%rax
.byte 72,1,200 // add %rcx,%rax
.byte 255,224 // jmpq *%rax
- .byte 102,15,127,100,36,168 // movdqa %xmm4,-0x58(%rsp)
- .byte 138,68,36,168 // mov -0x58(%rsp),%al
+ .byte 102,15,127,100,36,232 // movdqa %xmm4,-0x18(%rsp)
+ .byte 138,68,36,232 // mov -0x18(%rsp),%al
.byte 66,136,4,2 // mov %al,(%rdx,%r8,1)
.byte 235,203 // jmp 29175 <_sk_store_a8_sse2_8bit+0x59>
- .byte 102,15,127,100,36,184 // movdqa %xmm4,-0x48(%rsp)
- .byte 138,68,36,188 // mov -0x44(%rsp),%al
+ .byte 102,15,127,100,36,216 // movdqa %xmm4,-0x28(%rsp)
+ .byte 138,68,36,220 // mov -0x24(%rsp),%al
.byte 66,136,68,2,2 // mov %al,0x2(%rdx,%r8,1)
.byte 102,15,219,37,15,49,0,0 // pand 0x310f(%rip),%xmm4 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb>
.byte 102,15,103,228 // packuswb %xmm4,%xmm4
.byte 102,15,126,224 // movd %xmm4,%eax
.byte 102,66,137,4,2 // mov %ax,(%rdx,%r8,1)
.byte 235,165 // jmp 29175 <_sk_store_a8_sse2_8bit+0x59>
- .byte 102,15,127,100,36,232 // movdqa %xmm4,-0x18(%rsp)
- .byte 138,68,36,244 // mov -0xc(%rsp),%al
+ .byte 102,15,127,100,36,200 // movdqa %xmm4,-0x38(%rsp)
+ .byte 138,68,36,212 // mov -0x2c(%rsp),%al
.byte 66,136,68,2,6 // mov %al,0x6(%rdx,%r8,1)
- .byte 102,15,127,100,36,216 // movdqa %xmm4,-0x28(%rsp)
- .byte 138,68,36,226 // mov -0x1e(%rsp),%al
+ .byte 102,15,127,100,36,184 // movdqa %xmm4,-0x48(%rsp)
+ .byte 138,68,36,194 // mov -0x3e(%rsp),%al
.byte 66,136,68,2,5 // mov %al,0x5(%rdx,%r8,1)
- .byte 102,15,127,100,36,200 // movdqa %xmm4,-0x38(%rsp)
- .byte 138,68,36,208 // mov -0x30(%rsp),%al
+ .byte 102,15,127,100,36,168 // movdqa %xmm4,-0x58(%rsp)
+ .byte 138,68,36,176 // mov -0x50(%rsp),%al
.byte 66,136,68,2,4 // mov %al,0x4(%rdx,%r8,1)
.byte 102,15,219,37,203,48,0,0 // pand 0x30cb(%rip),%xmm4 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb>
.byte 102,15,103,228 // packuswb %xmm4,%xmm4
@@ -50440,9 +50440,9 @@ _sk_load_g8_sse2_8bit:
.byte 117,116 // jne 292c1 <_sk_load_g8_sse2_8bit+0x91>
.byte 243,66,15,126,4,2 // movq (%rdx,%r8,1),%xmm0
.byte 102,15,96,192 // punpcklbw %xmm0,%xmm0
- .byte 102,15,84,5,113,48,0,0 // andpd 0x3071(%rip),%xmm0 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb>
+ .byte 102,15,219,5,113,48,0,0 // pand 0x3071(%rip),%xmm0 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb>
.byte 102,15,239,201 // pxor %xmm1,%xmm1
- .byte 102,15,40,224 // movapd %xmm0,%xmm4
+ .byte 102,15,111,224 // movdqa %xmm0,%xmm4
.byte 102,15,97,225 // punpcklwd %xmm1,%xmm4
.byte 102,15,105,193 // punpckhwd %xmm1,%xmm0
.byte 102,15,111,45,169,55,0,0 // movdqa 0x37a9(%rip),%xmm5 # 2ca20 <_sk_overlay_sse2_8bit+0x153b>
@@ -50532,9 +50532,9 @@ _sk_load_g8_dst_sse2_8bit:
.byte 117,116 // jne 29401 <_sk_load_g8_dst_sse2_8bit+0x91>
.byte 243,66,15,126,20,2 // movq (%rdx,%r8,1),%xmm2
.byte 102,15,96,208 // punpcklbw %xmm0,%xmm2
- .byte 102,15,84,21,49,47,0,0 // andpd 0x2f31(%rip),%xmm2 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb>
+ .byte 102,15,219,21,49,47,0,0 // pand 0x2f31(%rip),%xmm2 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb>
.byte 102,15,239,219 // pxor %xmm3,%xmm3
- .byte 102,15,40,226 // movapd %xmm2,%xmm4
+ .byte 102,15,111,226 // movdqa %xmm2,%xmm4
.byte 102,15,97,227 // punpcklwd %xmm3,%xmm4
.byte 102,15,105,211 // punpckhwd %xmm3,%xmm2
.byte 102,15,111,45,105,54,0,0 // movdqa 0x3669(%rip),%xmm5 # 2ca20 <_sk_overlay_sse2_8bit+0x153b>
@@ -50815,9 +50815,9 @@ _sk_scale_u8_sse2_8bit:
.byte 15,133,239,0,0,0 // jne 298ad <_sk_scale_u8_sse2_8bit+0x110>
.byte 243,66,15,126,36,2 // movq (%rdx,%r8,1),%xmm4
.byte 102,15,96,224 // punpcklbw %xmm0,%xmm4
- .byte 102,15,84,37,0,43,0,0 // andpd 0x2b00(%rip),%xmm4 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb>
+ .byte 102,15,219,37,0,43,0,0 // pand 0x2b00(%rip),%xmm4 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb>
.byte 102,69,15,239,192 // pxor %xmm8,%xmm8
- .byte 102,15,40,236 // movapd %xmm4,%xmm5
+ .byte 102,15,111,236 // movdqa %xmm4,%xmm5
.byte 102,65,15,105,232 // punpckhwd %xmm8,%xmm5
.byte 102,65,15,97,224 // punpcklwd %xmm8,%xmm4
.byte 102,15,114,244,24 // pslld $0x18,%xmm4
@@ -51005,9 +51005,9 @@ _sk_lerp_u8_sse2_8bit:
.byte 15,133,141,1,0,0 // jne 29c44 <_sk_lerp_u8_sse2_8bit+0x1ae>
.byte 243,66,15,126,44,2 // movq (%rdx,%r8,1),%xmm5
.byte 102,15,96,232 // punpcklbw %xmm0,%xmm5
- .byte 102,15,84,45,7,40,0,0 // andpd 0x2807(%rip),%xmm5 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb>
+ .byte 102,15,219,45,7,40,0,0 // pand 0x2807(%rip),%xmm5 # 2c2d0 <_sk_overlay_sse2_8bit+0xdeb>
.byte 102,69,15,239,192 // pxor %xmm8,%xmm8
- .byte 102,15,40,229 // movapd %xmm5,%xmm4
+ .byte 102,15,111,229 // movdqa %xmm5,%xmm4
.byte 102,65,15,105,224 // punpckhwd %xmm8,%xmm4
.byte 102,65,15,97,232 // punpcklwd %xmm8,%xmm5
.byte 102,15,114,245,24 // pslld $0x18,%xmm5
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index 99ec6b9fa9..d85a0de655 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -39685,9 +39685,9 @@ _sk_load_a8_sse2_8bit LABEL PROC
DB 117,48 ; jne 296ad <_sk_load_a8_sse2_8bit+0x4d>
DB 243,66,15,126,4,2 ; movq (%rdx,%r8,1),%xmm0
DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
- DB 102,15,84,5,193,51,0,0 ; andpd 0x33c1(%rip),%xmm0 # 2ca50 <_sk_overlay_sse2_8bit+0xe03>
+ DB 102,15,219,5,193,51,0,0 ; pand 0x33c1(%rip),%xmm0 # 2ca50 <_sk_overlay_sse2_8bit+0xe03>
DB 102,15,239,228 ; pxor %xmm4,%xmm4
- DB 102,15,40,200 ; movapd %xmm0,%xmm1
+ DB 102,15,111,200 ; movdqa %xmm0,%xmm1
DB 102,15,105,204 ; punpckhwd %xmm4,%xmm1
DB 102,15,97,196 ; punpcklwd %xmm4,%xmm0
DB 102,15,114,240,24 ; pslld $0x18,%xmm0
@@ -39760,9 +39760,9 @@ _sk_load_a8_dst_sse2_8bit LABEL PROC
DB 117,48 ; jne 297a1 <_sk_load_a8_dst_sse2_8bit+0x4d>
DB 243,66,15,126,20,2 ; movq (%rdx,%r8,1),%xmm2
DB 102,15,96,208 ; punpcklbw %xmm0,%xmm2
- DB 102,15,84,21,205,50,0,0 ; andpd 0x32cd(%rip),%xmm2 # 2ca50 <_sk_overlay_sse2_8bit+0xe03>
+ DB 102,15,219,21,205,50,0,0 ; pand 0x32cd(%rip),%xmm2 # 2ca50 <_sk_overlay_sse2_8bit+0xe03>
DB 102,15,239,228 ; pxor %xmm4,%xmm4
- DB 102,15,40,218 ; movapd %xmm2,%xmm3
+ DB 102,15,111,218 ; movdqa %xmm2,%xmm3
DB 102,15,105,220 ; punpckhwd %xmm4,%xmm3
DB 102,15,97,212 ; punpcklwd %xmm4,%xmm2
DB 102,15,114,242,24 ; pslld $0x18,%xmm2
@@ -39858,48 +39858,46 @@ _sk_store_a8_sse2_8bit LABEL PROC
DB 72,99,4,129 ; movslq (%rcx,%rax,4),%rax
DB 72,1,200 ; add %rcx,%rax
DB 255,224 ; jmpq *%rax
- DB 102,15,127,36,36 ; movdqa %xmm4,(%rsp)
- DB 138,4,36 ; mov (%rsp),%al
+ DB 102,15,127,100,36,64 ; movdqa %xmm4,0x40(%rsp)
+ DB 138,68,36,64 ; mov 0x40(%rsp),%al
DB 66,136,4,2 ; mov %al,(%rdx,%r8,1)
- DB 235,201 ; jmp 298a5 <_sk_store_a8_sse2_8bit+0x5d>
- DB 102,15,127,100,36,16 ; movdqa %xmm4,0x10(%rsp)
- DB 138,68,36,20 ; mov 0x14(%rsp),%al
+ DB 235,199 ; jmp 298a5 <_sk_store_a8_sse2_8bit+0x5d>
+ DB 102,15,127,100,36,48 ; movdqa %xmm4,0x30(%rsp)
+ DB 138,68,36,52 ; mov 0x34(%rsp),%al
DB 66,136,68,2,2 ; mov %al,0x2(%rdx,%r8,1)
- DB 102,15,219,37,93,49,0,0 ; pand 0x315d(%rip),%xmm4 # 2ca50 <_sk_overlay_sse2_8bit+0xe03>
+ DB 102,15,219,37,91,49,0,0 ; pand 0x315b(%rip),%xmm4 # 2ca50 <_sk_overlay_sse2_8bit+0xe03>
DB 102,15,103,228 ; packuswb %xmm4,%xmm4
DB 102,15,126,224 ; movd %xmm4,%eax
DB 102,66,137,4,2 ; mov %ax,(%rdx,%r8,1)
- DB 235,163 ; jmp 298a5 <_sk_store_a8_sse2_8bit+0x5d>
- DB 102,15,127,100,36,64 ; movdqa %xmm4,0x40(%rsp)
- DB 138,68,36,76 ; mov 0x4c(%rsp),%al
+ DB 235,161 ; jmp 298a5 <_sk_store_a8_sse2_8bit+0x5d>
+ DB 102,15,127,100,36,32 ; movdqa %xmm4,0x20(%rsp)
+ DB 138,68,36,44 ; mov 0x2c(%rsp),%al
DB 66,136,68,2,6 ; mov %al,0x6(%rdx,%r8,1)
- DB 102,15,127,100,36,48 ; movdqa %xmm4,0x30(%rsp)
- DB 138,68,36,58 ; mov 0x3a(%rsp),%al
+ DB 102,15,127,100,36,16 ; movdqa %xmm4,0x10(%rsp)
+ DB 138,68,36,26 ; mov 0x1a(%rsp),%al
DB 66,136,68,2,5 ; mov %al,0x5(%rdx,%r8,1)
- DB 102,15,127,100,36,32 ; movdqa %xmm4,0x20(%rsp)
- DB 138,68,36,40 ; mov 0x28(%rsp),%al
+ DB 102,15,127,36,36 ; movdqa %xmm4,(%rsp)
+ DB 138,68,36,8 ; mov 0x8(%rsp),%al
DB 66,136,68,2,4 ; mov %al,0x4(%rdx,%r8,1)
- DB 102,15,219,37,25,49,0,0 ; pand 0x3119(%rip),%xmm4 # 2ca50 <_sk_overlay_sse2_8bit+0xe03>
+ DB 102,15,219,37,24,49,0,0 ; pand 0x3118(%rip),%xmm4 # 2ca50 <_sk_overlay_sse2_8bit+0xe03>
DB 102,15,103,228 ; packuswb %xmm4,%xmm4
DB 102,66,15,126,36,2 ; movd %xmm4,(%rdx,%r8,1)
- DB 233,95,255,255,255 ; jmpq 298a5 <_sk_store_a8_sse2_8bit+0x5d>
- DB 102,144 ; xchg %ax,%ax
+ DB 233,94,255,255,255 ; jmpq 298a5 <_sk_store_a8_sse2_8bit+0x5d>
+ DB 144 ; nop
DB 134,255 ; xchg %bh,%bh
DB 255 ; (bad)
- DB 255,163,255,255,255,148 ; jmpq *-0x6b000001(%rbx)
+ DB 255,165,255,255,255,150 ; jmpq *-0x69000001(%rbp)
DB 255 ; (bad)
DB 255 ; (bad)
- DB 255,231 ; jmpq *%rdi
- DB 255 ; (bad)
DB 255 ; (bad)
+ DB 232,255,255,255,218 ; callq ffffffffdb029958 <_sk_overlay_sse2_8bit+0xffffffffdaffdd0b>
DB 255 ; (bad)
- DB 216,255 ; fdivr %st(7),%st
DB 255 ; (bad)
- DB 255,201 ; dec %ecx
+ DB 255,203 ; dec %ebx
DB 255 ; (bad)
DB 255 ; (bad)
DB 255 ; (bad)
- DB 186 ; .byte 0xba
+ DB 188 ; .byte 0xbc
DB 255 ; (bad)
DB 255 ; (bad)
DB 255 ; .byte 0xff
@@ -39917,9 +39915,9 @@ _sk_load_g8_sse2_8bit LABEL PROC
DB 117,116 ; jne 299f5 <_sk_load_g8_sse2_8bit+0x91>
DB 243,66,15,126,4,2 ; movq (%rdx,%r8,1),%xmm0
DB 102,15,96,192 ; punpcklbw %xmm0,%xmm0
- DB 102,15,84,5,189,48,0,0 ; andpd 0x30bd(%rip),%xmm0 # 2ca50 <_sk_overlay_sse2_8bit+0xe03>
+ DB 102,15,219,5,189,48,0,0 ; pand 0x30bd(%rip),%xmm0 # 2ca50 <_sk_overlay_sse2_8bit+0xe03>
DB 102,15,239,201 ; pxor %xmm1,%xmm1
- DB 102,15,40,224 ; movapd %xmm0,%xmm4
+ DB 102,15,111,224 ; movdqa %xmm0,%xmm4
DB 102,15,97,225 ; punpcklwd %xmm1,%xmm4
DB 102,15,105,193 ; punpckhwd %xmm1,%xmm0
DB 102,15,111,45,245,55,0,0 ; movdqa 0x37f5(%rip),%xmm5 # 2d1a0 <_sk_overlay_sse2_8bit+0x1553>
@@ -40007,9 +40005,9 @@ _sk_load_g8_dst_sse2_8bit LABEL PROC
DB 117,116 ; jne 29b35 <_sk_load_g8_dst_sse2_8bit+0x91>
DB 243,66,15,126,20,2 ; movq (%rdx,%r8,1),%xmm2
DB 102,15,96,208 ; punpcklbw %xmm0,%xmm2
- DB 102,15,84,21,125,47,0,0 ; andpd 0x2f7d(%rip),%xmm2 # 2ca50 <_sk_overlay_sse2_8bit+0xe03>
+ DB 102,15,219,21,125,47,0,0 ; pand 0x2f7d(%rip),%xmm2 # 2ca50 <_sk_overlay_sse2_8bit+0xe03>
DB 102,15,239,219 ; pxor %xmm3,%xmm3
- DB 102,15,40,226 ; movapd %xmm2,%xmm4
+ DB 102,15,111,226 ; movdqa %xmm2,%xmm4
DB 102,15,97,227 ; punpcklwd %xmm3,%xmm4
DB 102,15,105,211 ; punpckhwd %xmm3,%xmm2
DB 102,15,111,45,181,54,0,0 ; movdqa 0x36b5(%rip),%xmm5 # 2d1a0 <_sk_overlay_sse2_8bit+0x1553>
@@ -40284,9 +40282,9 @@ _sk_scale_u8_sse2_8bit LABEL PROC
DB 15,133,239,0,0,0 ; jne 29fe1 <_sk_scale_u8_sse2_8bit+0x110>
DB 243,66,15,126,36,2 ; movq (%rdx,%r8,1),%xmm4
DB 102,15,96,224 ; punpcklbw %xmm0,%xmm4
- DB 102,15,84,37,76,43,0,0 ; andpd 0x2b4c(%rip),%xmm4 # 2ca50 <_sk_overlay_sse2_8bit+0xe03>
+ DB 102,15,219,37,76,43,0,0 ; pand 0x2b4c(%rip),%xmm4 # 2ca50 <_sk_overlay_sse2_8bit+0xe03>
DB 102,69,15,239,192 ; pxor %xmm8,%xmm8
- DB 102,15,40,236 ; movapd %xmm4,%xmm5
+ DB 102,15,111,236 ; movdqa %xmm4,%xmm5
DB 102,65,15,105,232 ; punpckhwd %xmm8,%xmm5
DB 102,65,15,97,224 ; punpcklwd %xmm8,%xmm4
DB 102,15,114,244,24 ; pslld $0x18,%xmm4
@@ -40470,9 +40468,9 @@ _sk_lerp_u8_sse2_8bit LABEL PROC
DB 15,133,141,1,0,0 ; jne 2a378 <_sk_lerp_u8_sse2_8bit+0x1ae>
DB 243,66,15,126,44,2 ; movq (%rdx,%r8,1),%xmm5
DB 102,15,96,232 ; punpcklbw %xmm0,%xmm5
- DB 102,15,84,45,83,40,0,0 ; andpd 0x2853(%rip),%xmm5 # 2ca50 <_sk_overlay_sse2_8bit+0xe03>
+ DB 102,15,219,45,83,40,0,0 ; pand 0x2853(%rip),%xmm5 # 2ca50 <_sk_overlay_sse2_8bit+0xe03>
DB 102,69,15,239,192 ; pxor %xmm8,%xmm8
- DB 102,15,40,229 ; movapd %xmm5,%xmm4
+ DB 102,15,111,229 ; movdqa %xmm5,%xmm4
DB 102,65,15,105,224 ; punpckhwd %xmm8,%xmm4
DB 102,65,15,97,232 ; punpcklwd %xmm8,%xmm5
DB 102,15,114,245,24 ; pslld $0x18,%xmm5
diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp
index 5c73ea8cbe..0c019f8fbc 100644
--- a/src/jumper/SkJumper_stages_8bit.cpp
+++ b/src/jumper/SkJumper_stages_8bit.cpp
@@ -5,23 +5,27 @@
* found in the LICENSE file.
*/
+// This restricted SkJumper backend works on 8-bit per channel interlaced
+// pixels. This is the natural format for kN32_SkColorType buffers, and we
+// hope the stages in this file can replace many custom legacy routines.
+
#include "SkJumper.h"
#include "SkJumper_misc.h"
-#if defined(__SSE2__)
+// As an experiment we bake ARMv8 8-bit code in as normally compiled Skia code.
+// Any other platform (so far) is offline-only.
+#if defined(JUMPER_IS_OFFLINE) || (defined(__clang__) && defined(__aarch64__))
+
+#if defined(__aarch64__)
+ #include <arm_neon.h>
+#else
#include <immintrin.h>
#endif
-// This restricted SkJumper backend works on 8-bit per channel interlaced
-// pixels. This is the natural format for kN32_SkColorType buffers, and we
-// hope the stages in this file can replace many custom legacy routines.
-
#if !defined(JUMPER_IS_OFFLINE)
- #error "This file must be pre-compiled."
+ #define WRAP(name) sk_##name##_8bit
#elif defined(__aarch64__)
#define WRAP(name) sk_##name##_aarch64_8bit
-#elif defined(__arm__)
- #define WRAP(name) sk_##name##_vfp4_8bit
#elif defined(__AVX2__)
#define WRAP(name) sk_##name##_hsw_8bit
#elif defined(__SSE4_1__)
@@ -112,7 +116,7 @@ SI V operator*(V x, V y) {
template <typename T>
SI T inv(T v) { return 0xff - v; }
-SI V two(V v) { return v + v; }
+
SI V lerp(V from, V to, V t) { return to*t + from*inv(t); }
SI V alpha(V v) {
@@ -162,10 +166,13 @@ SI V saturated_add(V a, V b) {
b_lo, b_hi;
split(a.u8x4, &a_lo, &a_hi);
split(b.u8x4, &b_lo, &b_hi);
-#if defined(__AVX2__)
+#if defined(__aarch64__)
+ return join(vqaddq_u8(a_lo, b_lo),
+ vqaddq_u8(a_hi, b_hi));
+#elif defined(__AVX2__)
return join(_mm256_adds_epu8(a_lo, b_lo),
_mm256_adds_epu8(a_hi, b_hi));
-#else
+#elif defined(__SSE2__)
return join(_mm_adds_epu8(a_lo, b_lo),
_mm_adds_epu8(a_hi, b_hi));
#endif
@@ -185,7 +192,11 @@ using Stage = void(const Params* params, void** program, R src_lo, R src_hi, R d
MAYBE_MSABI
extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t ylimit,
void** program, const SkJumper_constants*) {
- R r;
+#if defined(JUMPER_IS_OFFLINE)
+ R r; // Fastest to start uninitialized.
+#else
+ R r{}; // Next best is zero'd for compilers that will complain about uninitialized values.
+#endif
auto start = (Stage*)load_and_inc(program);
for (; y < ylimit; y++) {
Params params = { x,y,0 };
@@ -223,6 +234,7 @@ SI V load(const T* src, size_t tail) {
if (__builtin_expect(tail, 0)) {
V v = 0;
switch (tail) {
+ #if defined(__AVX2__)
case 15: v[14] = src[14];
case 14: v[13] = src[13];
case 13: v[12] = src[12];
@@ -231,6 +243,7 @@ SI V load(const T* src, size_t tail) {
case 10: v[ 9] = src[ 9];
case 9: v[ 8] = src[ 8];
case 8: memcpy(&v, src, 8*sizeof(T)); break;
+ #endif
case 7: v[6] = src[6];
case 6: v[5] = src[5];
case 5: v[4] = src[4];
@@ -249,6 +262,7 @@ SI void store(T* dst, V v, size_t tail) {
__builtin_assume(tail < kStride);
if (__builtin_expect(tail, 0)) {
switch (tail) {
+ #if defined(__AVX2__)
case 15: dst[14] = v[14];
case 14: dst[13] = v[13];
case 13: dst[12] = v[12];
@@ -257,6 +271,7 @@ SI void store(T* dst, V v, size_t tail) {
case 10: dst[ 9] = v[ 9];
case 9: dst[ 8] = v[ 8];
case 8: memcpy(dst, &v, 8*sizeof(T)); break;
+ #endif
case 7: dst[6] = v[6];
case 6: dst[5] = v[5];
case 5: dst[4] = v[4];
@@ -461,3 +476,5 @@ STAGE(overlay) {
// colorburn |
// colordodge > these involve division, which makes them (much) slower than the float stages.
// softlight |
+
+#endif