diff options
author | 2017-06-05 12:09:47 -0400 | |
---|---|---|
committer | 2017-06-05 16:32:54 +0000 | |
commit | 5adb01bf0d190b20abe50fac608f396c65993424 (patch) | |
tree | 70086fbd3072a448eed3352d1641ad4fc6b5a7ca /src | |
parent | 1f29bf093f01e9c9bf79cbd2ac27da62efc8e3a4 (diff) |
lowp: add move_src_dst and srcover
This is enough to run the bench SkRasterPipeline_compile.
$ ninja -C out monobench; and out/monobench SkRasterPipeline_compile 300
Before: 300 SkRasterPipeline_compile 48.4858ns
After: 300 SkRasterPipeline_compile 37.5801ns
Change-Id: Icb80348908dfb016826700a44566222c9f7a853c
Reviewed-on: https://skia-review.googlesource.com/18595
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src')
-rw-r--r-- | src/jumper/SkJumper.cpp | 16 | ||||
-rw-r--r-- | src/jumper/SkJumper_generated.S | 48 | ||||
-rw-r--r-- | src/jumper/SkJumper_generated_win.S | 44 | ||||
-rw-r--r-- | src/jumper/SkJumper_stages_lowp.cpp | 25 |
4 files changed, 110 insertions, 23 deletions
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp index 7059822647..535e4a883b 100644 --- a/src/jumper/SkJumper.cpp +++ b/src/jumper/SkJumper.cpp @@ -83,9 +83,11 @@ extern "C" { SK_RASTER_PIPELINE_STAGES(M) #undef M - StageFn ASM(load_8888, ssse3_lowp), - ASM(store_8888, ssse3_lowp), - ASM(swap_rb, ssse3_lowp); + StageFn ASM(load_8888, ssse3_lowp), + ASM(store_8888, ssse3_lowp), + ASM(swap_rb, ssse3_lowp), + ASM(move_src_dst, ssse3_lowp), + ASM(srcover, ssse3_lowp); #endif // Portable, single-pixel stages. @@ -186,9 +188,11 @@ StartPipelineFn* SkRasterPipeline::build_pipeline(void** ip) const { for (const StageList* st = fStages; st; st = st->prev) { StageFn* fn = nullptr; switch (st->stage) { - case SkRasterPipeline::load_8888: fn = ASM(load_8888, ssse3_lowp); break; - case SkRasterPipeline::store_8888: fn = ASM(store_8888,ssse3_lowp); break; - case SkRasterPipeline::swap_rb: fn = ASM(swap_rb, ssse3_lowp); break; + case SkRasterPipeline::load_8888: fn = ASM(load_8888, ssse3_lowp); break; + case SkRasterPipeline::store_8888: fn = ASM(store_8888, ssse3_lowp); break; + case SkRasterPipeline::swap_rb: fn = ASM(swap_rb, ssse3_lowp); break; + case SkRasterPipeline::move_src_dst: fn = ASM(move_src_dst, ssse3_lowp); break; + case SkRasterPipeline::srcover: fn = ASM(srcover, ssse3_lowp); break; default: //SkDebugf("can't %d\n", st->stage); ip = reset_point; diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S index 52bf029be2..de0fb731a0 100644 --- a/src/jumper/SkJumper_generated.S +++ b/src/jumper/SkJumper_generated.S @@ -36953,7 +36953,7 @@ _sk_load_8888_ssse3_lowp: .byte 117,113 // jne 127 <_sk_load_8888_ssse3_lowp+0x7b> .byte 69,15,16,76,147,16 // movups 0x10(%r11,%rdx,4),%xmm9 .byte 69,15,16,4,147 // movups (%r11,%rdx,4),%xmm8 - .byte 102,15,111,5,55,2,0,0 // movdqa 0x237(%rip),%xmm0 # 300 <_sk_swap_rb_ssse3_lowp+0x14> + .byte 102,15,111,5,167,2,0,0 // movdqa 0x2a7(%rip),%xmm0 # 370 <_sk_srcover_ssse3_lowp+0x65> .byte 102,68,15,56,0,192 // pshufb %xmm0,%xmm8 .byte 102,68,15,56,0,200 // pshufb %xmm0,%xmm9 .byte 102,65,15,111,208 // movdqa %xmm8,%xmm2 @@ -36967,7 +36967,7 @@ _sk_load_8888_ssse3_lowp: .byte 102,15,239,210 // pxor %xmm2,%xmm2 .byte 102,65,15,96,208 // punpcklbw %xmm8,%xmm2 .byte 102,65,15,104,216 // punpckhbw %xmm8,%xmm3 - .byte 102,68,15,111,5,1,2,0,0 // movdqa 0x201(%rip),%xmm8 # 310 <_sk_swap_rb_ssse3_lowp+0x24> + .byte 102,68,15,111,5,113,2,0,0 // movdqa 0x271(%rip),%xmm8 # 380 <_sk_srcover_ssse3_lowp+0x75> .byte 102,65,15,228,192 // pmulhuw %xmm8,%xmm0 .byte 102,65,15,228,200 // pmulhuw %xmm8,%xmm1 .byte 102,65,15,228,208 // pmulhuw %xmm8,%xmm2 @@ -37109,6 +37109,41 @@ _sk_swap_rb_ssse3_lowp: .byte 65,15,40,208 // movaps %xmm8,%xmm2 .byte 255,224 // jmpq *%rax +HIDDEN _sk_move_src_dst_ssse3_lowp +.globl _sk_move_src_dst_ssse3_lowp +FUNCTION(_sk_move_src_dst_ssse3_lowp) +_sk_move_src_dst_ssse3_lowp: + .byte 72,173 // lods %ds:(%rsi),%rax + .byte 15,40,224 // movaps %xmm0,%xmm4 + .byte 15,40,233 // movaps %xmm1,%xmm5 + .byte 15,40,242 // movaps %xmm2,%xmm6 + .byte 15,40,251 // movaps %xmm3,%xmm7 + .byte 255,224 // jmpq *%rax + +HIDDEN _sk_srcover_ssse3_lowp +.globl _sk_srcover_ssse3_lowp +FUNCTION(_sk_srcover_ssse3_lowp) +_sk_srcover_ssse3_lowp: + .byte 102,68,15,111,5,124,0,0,0 // movdqa 0x7c(%rip),%xmm8 # 390 <_sk_srcover_ssse3_lowp+0x85> + .byte 102,68,15,249,195 // psubw %xmm3,%xmm8 + .byte 102,68,15,111,204 // movdqa %xmm4,%xmm9 + .byte 102,69,15,56,11,200 // pmulhrsw %xmm8,%xmm9 + .byte 102,69,15,56,29,201 // pabsw %xmm9,%xmm9 + .byte 102,65,15,253,193 // paddw %xmm9,%xmm0 + .byte 102,68,15,111,205 // movdqa %xmm5,%xmm9 + .byte 102,69,15,56,11,200 // pmulhrsw %xmm8,%xmm9 + .byte 102,69,15,56,29,201 // pabsw %xmm9,%xmm9 + .byte 102,65,15,253,201 // paddw %xmm9,%xmm1 + .byte 102,68,15,111,206 // movdqa %xmm6,%xmm9 + .byte 102,69,15,56,11,200 // pmulhrsw %xmm8,%xmm9 + .byte 102,69,15,56,29,201 // pabsw %xmm9,%xmm9 + .byte 102,65,15,253,209 // paddw %xmm9,%xmm2 + .byte 102,68,15,56,11,199 // pmulhrsw %xmm7,%xmm8 + .byte 102,69,15,56,29,192 // pabsw %xmm8,%xmm8 + .byte 102,65,15,253,216 // paddw %xmm8,%xmm3 + .byte 72,173 // lods %ds:(%rsi),%rax + .byte 255,224 // jmpq *%rax + BALIGN16 .byte 0,4,8 // add %al,(%rax,%rcx,1) .byte 12,1 // or $0x1,%al @@ -37117,10 +37152,7 @@ BALIGN16 .byte 3,7 // add (%rdi),%eax .byte 11,15 // or (%rdi),%ecx .byte 129,128,129,128,129,128,129,128,129,128// addl $0x80818081,-0x7f7e7f7f(%rax) - .byte 129 // .byte 0x81 - .byte 128 // .byte 0x80 - .byte 129 // .byte 0x81 - .byte 128 // .byte 0x80 - .byte 129 // .byte 0x81 - .byte 128 // .byte 0x80 + .byte 129,128,129,128,129,128,0,128,0,128 // addl $0x80008000,-0x7f7e7f7f(%rax) + .byte 0,128,0,128,0,128 // add %al,-0x7fff8000(%rax) + .byte 0,128,0,128,0,128 // add %al,-0x7fff8000(%rax) #endif diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S index ecbffaa201..67c9b86a2b 100644 --- a/src/jumper/SkJumper_generated_win.S +++ b/src/jumper/SkJumper_generated_win.S @@ -26366,7 +26366,7 @@ _sk_load_8888_ssse3_lowp LABEL PROC DB 117,113 ; jne 1b7 <_sk_load_8888_ssse3_lowp+0x7b> DB 69,15,16,76,147,16 ; movups 0x10(%r11,%rdx,4),%xmm9 DB 69,15,16,4,147 ; movups (%r11,%rdx,4),%xmm8 - DB 102,15,111,5,55,2,0,0 ; movdqa 0x237(%rip),%xmm0 # 390 <_sk_swap_rb_ssse3_lowp+0x14> + DB 102,15,111,5,167,2,0,0 ; movdqa 0x2a7(%rip),%xmm0 # 400 <_sk_srcover_ssse3_lowp+0x65> DB 102,68,15,56,0,192 ; pshufb %xmm0,%xmm8 DB 102,68,15,56,0,200 ; pshufb %xmm0,%xmm9 DB 102,65,15,111,208 ; movdqa %xmm8,%xmm2 @@ -26380,7 +26380,7 @@ _sk_load_8888_ssse3_lowp LABEL PROC DB 102,15,239,210 ; pxor %xmm2,%xmm2 DB 102,65,15,96,208 ; punpcklbw %xmm8,%xmm2 DB 102,65,15,104,216 ; punpckhbw %xmm8,%xmm3 - DB 102,68,15,111,5,1,2,0,0 ; movdqa 0x201(%rip),%xmm8 # 3a0 <_sk_swap_rb_ssse3_lowp+0x24> + DB 102,68,15,111,5,113,2,0,0 ; movdqa 0x271(%rip),%xmm8 # 410 <_sk_srcover_ssse3_lowp+0x75> DB 102,65,15,228,192 ; pmulhuw %xmm8,%xmm0 DB 102,65,15,228,200 ; pmulhuw %xmm8,%xmm1 DB 102,65,15,228,208 ; pmulhuw %xmm8,%xmm2 @@ -26518,6 +26518,37 @@ _sk_swap_rb_ssse3_lowp LABEL PROC DB 65,15,40,208 ; movaps %xmm8,%xmm2 DB 255,224 ; jmpq *%rax +PUBLIC _sk_move_src_dst_ssse3_lowp +_sk_move_src_dst_ssse3_lowp LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 15,40,224 ; movaps %xmm0,%xmm4 + DB 15,40,233 ; movaps %xmm1,%xmm5 + DB 15,40,242 ; movaps %xmm2,%xmm6 + DB 15,40,251 ; movaps %xmm3,%xmm7 + DB 255,224 ; jmpq *%rax + +PUBLIC _sk_srcover_ssse3_lowp +_sk_srcover_ssse3_lowp LABEL PROC + DB 102,68,15,111,5,124,0,0,0 ; movdqa 0x7c(%rip),%xmm8 # 420 <_sk_srcover_ssse3_lowp+0x85> + DB 102,68,15,249,195 ; psubw %xmm3,%xmm8 + DB 102,68,15,111,204 ; movdqa %xmm4,%xmm9 + DB 102,69,15,56,11,200 ; pmulhrsw %xmm8,%xmm9 + DB 102,69,15,56,29,201 ; pabsw %xmm9,%xmm9 + DB 102,65,15,253,193 ; paddw %xmm9,%xmm0 + DB 102,68,15,111,205 ; movdqa %xmm5,%xmm9 + DB 102,69,15,56,11,200 ; pmulhrsw %xmm8,%xmm9 + DB 102,69,15,56,29,201 ; pabsw %xmm9,%xmm9 + DB 102,65,15,253,201 ; paddw %xmm9,%xmm1 + DB 102,68,15,111,206 ; movdqa %xmm6,%xmm9 + DB 102,69,15,56,11,200 ; pmulhrsw %xmm8,%xmm9 + DB 102,69,15,56,29,201 ; pabsw %xmm9,%xmm9 + DB 102,65,15,253,209 ; paddw %xmm9,%xmm2 + DB 102,68,15,56,11,199 ; pmulhrsw %xmm7,%xmm8 + DB 102,69,15,56,29,192 ; pabsw %xmm8,%xmm8 + DB 102,65,15,253,216 ; paddw %xmm8,%xmm3 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + ALIGN 16 DB 0,4,8 ; add %al,(%rax,%rcx,1) DB 12,1 ; or $0x1,%al @@ -26526,11 +26557,8 @@ ALIGN 16 DB 3,7 ; add (%rdi),%eax DB 11,15 ; or (%rdi),%ecx DB 129,128,129,128,129,128,129,128,129,128; addl $0x80818081,-0x7f7e7f7f(%rax) - DB 129 ; .byte 0x81 - DB 128 ; .byte 0x80 - DB 129 ; .byte 0x81 - DB 128 ; .byte 0x80 - DB 129 ; .byte 0x81 - DB 128 ; .byte 0x80 + DB 129,128,129,128,129,128,0,128,0,128 ; addl $0x80008000,-0x7f7e7f7f(%rax) + DB 0,128,0,128,0,128 ; add %al,-0x7fff8000(%rax) + DB 0,128,0,128,0,128 ; add %al,-0x7fff8000(%rax) ENDIF END diff --git a/src/jumper/SkJumper_stages_lowp.cpp b/src/jumper/SkJumper_stages_lowp.cpp index 4d0df94af6..139382377d 100644 --- a/src/jumper/SkJumper_stages_lowp.cpp +++ b/src/jumper/SkJumper_stages_lowp.cpp @@ -28,7 +28,7 @@ struct F { U16 vec; F() = default; - F(uint16_t bits) : vec(bits) {} + F(float f) : vec((uint16_t)(f * 0x8000)) {} F(U16 v) : vec(v) {} operator U16() const { return vec; } @@ -38,6 +38,7 @@ SI F operator+(F x, F y) { return x.vec + y.vec; } SI F operator-(F x, F y) { return x.vec - y.vec; } SI F operator*(F x, F y) { return _mm_abs_epi16(_mm_mulhrs_epi16(x.vec, y.vec)); } SI F mad(F f, F m, F a) { return f*m+a; } +SI F inv(F v) { return 1.0f - v; } SI F operator<<(F x, int bits) { return x.vec << bits; } SI F operator>>(F x, int bits) { return x.vec >> bits; } @@ -178,3 +179,25 @@ STAGE(swap_rb) { r = b; b = tmp; } + +STAGE(move_src_dst) { + dr = r; + dg = g; + db = b; + da = a; +} + +// Most blend modes apply the same logic to each channel. +#define BLEND_MODE(name) \ + SI F name##_channel(F s, F d, F sa, F da); \ + STAGE(name) { \ + r = name##_channel(r,dr,a,da); \ + g = name##_channel(g,dg,a,da); \ + b = name##_channel(b,db,a,da); \ + a = name##_channel(a,da,a,da); \ + } \ + SI F name##_channel(F s, F d, F sa, F da) + +BLEND_MODE(srcover) { return mad(d, inv(sa), s); } + +#undef BLEND_MODE |