diff options
Diffstat (limited to 'src/jumper')
-rw-r--r-- | src/jumper/SkJumper.cpp | 74 | ||||
-rw-r--r-- | src/jumper/SkJumper_generated.S | 261 | ||||
-rw-r--r-- | src/jumper/SkJumper_generated_win.S | 269 | ||||
-rw-r--r-- | src/jumper/SkJumper_stages_lowp.cpp | 180 | ||||
-rwxr-xr-x | src/jumper/build_stages.py | 12 |
5 files changed, 777 insertions, 19 deletions
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp index c5840d3522..77a5856027 100644 --- a/src/jumper/SkJumper.cpp +++ b/src/jumper/SkJumper.cpp @@ -29,6 +29,7 @@ static K kConstants = { // We can't express the real types of most stage functions portably, so we use a stand-in. // We'll only ever call start_pipeline(), which then chains into the rest for us. using StageFn = void(void); +using StartPipelineFn = void(size_t,size_t,size_t,void**,K*); // Some platforms expect C "name" maps to asm "_name", others to "name". #if defined(__APPLE__) @@ -43,29 +44,31 @@ extern "C" { // We'll just run portable code. #elif defined(__aarch64__) - void ASM(start_pipeline,aarch64)(size_t,size_t,size_t, void**, K*); + StartPipelineFn ASM(start_pipeline,aarch64); StageFn ASM(just_return,aarch64); #define M(st) StageFn ASM(st,aarch64); SK_RASTER_PIPELINE_STAGES(M) #undef M #elif defined(__arm__) - void ASM(start_pipeline,vfp4)(size_t,size_t,size_t, void**, K*); + StartPipelineFn ASM(start_pipeline,vfp4); StageFn ASM(just_return,vfp4); #define M(st) StageFn ASM(st,vfp4); SK_RASTER_PIPELINE_STAGES(M) #undef M #elif defined(__x86_64__) || defined(_M_X64) - void ASM(start_pipeline,hsw )(size_t,size_t,size_t, void**, K*); - void ASM(start_pipeline,avx )(size_t,size_t,size_t, void**, K*); - void ASM(start_pipeline,sse41)(size_t,size_t,size_t, void**, K*); - void ASM(start_pipeline,sse2 )(size_t,size_t,size_t, void**, K*); + StartPipelineFn ASM(start_pipeline,hsw ), + ASM(start_pipeline,avx ), + ASM(start_pipeline,sse41 ), + ASM(start_pipeline,sse2 ), + ASM(start_pipeline,ssse3_lowp); StageFn ASM(just_return,hsw), ASM(just_return,avx), ASM(just_return,sse41), - ASM(just_return,sse2); + ASM(just_return,sse2), + ASM(just_return,ssse3_lowp); #define M(st) StageFn ASM(st,hsw); SK_RASTER_PIPELINE_STAGES(M) @@ -79,10 +82,14 @@ extern "C" { #define M(st) StageFn ASM(st,sse2); SK_RASTER_PIPELINE_STAGES(M) #undef M + + StageFn ASM(load_8888, ssse3_lowp), + ASM(store_8888, ssse3_lowp), + ASM(swap_rb, ssse3_lowp); #endif // Portable, single-pixel stages. - void sk_start_pipeline(size_t,size_t,size_t, void**, K*); + StartPipelineFn sk_start_pipeline; StageFn sk_just_return; #define M(st) StageFn sk_##st; SK_RASTER_PIPELINE_STAGES(M) @@ -170,43 +177,72 @@ static SkJumper_Engine choose_engine() { return kPortable; } -void SkRasterPipeline::build_pipeline(const SkJumper_Engine& engine, void** ip) const { - const StageList* st = fStages; +StartPipelineFn* SkRasterPipeline::build_pipeline(void** ip) const { +#if defined(__x86_64__) || defined(_M_X64) + if (SkCpu::Supports(SkCpu::SSSE3)) { + void** reset_point = ip; + + *--ip = (void*)ASM(just_return,ssse3_lowp); + for (const StageList* st = fStages; st; st = st->prev) { + StageFn* fn = nullptr; + switch (st->stage) { + case SkRasterPipeline::load_8888: fn = ASM(load_8888, ssse3_lowp); break; + case SkRasterPipeline::store_8888: fn = ASM(store_8888,ssse3_lowp); break; + case SkRasterPipeline::swap_rb: fn = ASM(swap_rb, ssse3_lowp); break; + default: + //SkDebugf("can't %d\n", st->stage); + ip = reset_point; + } + if (ip == reset_point) { + break; + } + if (st->ctx) { + *--ip = st->ctx; + } + *--ip = (void*)fn; + } + + if (ip != reset_point) { + return ASM(start_pipeline,ssse3_lowp); + } + } +#endif + gChooseEngineOnce([]{ gEngine = choose_engine(); }); + // We're building the pipeline backwards, so we start with the final stage just_return. - *--ip = (void*)engine.just_return; + *--ip = (void*)gEngine.just_return; // Still going backwards, each stage's context pointer then its StageFn. - for (; st; st = st->prev) { + for (const StageList* st = fStages; st; st = st->prev) { if (st->ctx) { *--ip = st->ctx; } - *--ip = (void*)engine.stages[st->stage]; + *--ip = (void*)gEngine.stages[st->stage]; } + return gEngine.start_pipeline; } void SkRasterPipeline::run(size_t x, size_t y, size_t n) const { if (this->empty()) { return; } - gChooseEngineOnce([]{ gEngine = choose_engine(); }); // Best to not use fAlloc here... we can't bound how often run() will be called. SkAutoSTMalloc<64, void*> program(fSlotsNeeded); - this->build_pipeline(gEngine, program.get() + fSlotsNeeded); - gEngine.start_pipeline(x,y,x+n, program.get(), &kConstants); + auto start_pipeline = this->build_pipeline(program.get() + fSlotsNeeded); + start_pipeline(x,y,x+n, program.get(), &kConstants); } std::function<void(size_t, size_t, size_t)> SkRasterPipeline::compile() const { if (this->empty()) { return [](size_t, size_t, size_t) {}; } - gChooseEngineOnce([]{ gEngine = choose_engine(); }); void** program = fAlloc->makeArray<void*>(fSlotsNeeded); - this->build_pipeline(gEngine, program + fSlotsNeeded); + auto start_pipeline = this->build_pipeline(program + fSlotsNeeded); return [=](size_t x, size_t y, size_t n) { - gEngine.start_pipeline(x,y,x+n, program, &kConstants); + start_pipeline(x,y,x+n, program, &kConstants); }; } diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S index 8379663481..84eb7a5e13 100644 --- a/src/jumper/SkJumper_generated.S +++ b/src/jumper/SkJumper_generated.S @@ -36884,4 +36884,265 @@ BALIGN16 .byte 170 // stos %al,%es:(%rdi) .byte 170 // stos %al,%es:(%rdi) .byte 190,171,170,170,190 // mov $0xbeaaaaab,%esi +BALIGN32 + +HIDDEN _sk_start_pipeline_ssse3_lowp +.globl _sk_start_pipeline_ssse3_lowp +FUNCTION(_sk_start_pipeline_ssse3_lowp) +_sk_start_pipeline_ssse3_lowp: + .byte 85 // push %rbp + .byte 65,87 // push %r15 + .byte 65,86 // push %r14 + .byte 65,85 // push %r13 + .byte 65,84 // push %r12 + .byte 83 // push %rbx + .byte 80 // push %rax + .byte 77,137,198 // mov %r8,%r14 + .byte 73,137,213 // mov %rdx,%r13 + .byte 73,137,247 // mov %rsi,%r15 + .byte 72,137,251 // mov %rdi,%rbx + .byte 72,137,206 // mov %rcx,%rsi + .byte 72,173 // lods %ds:(%rsi),%rax + .byte 73,137,244 // mov %rsi,%r12 + .byte 72,141,75,8 // lea 0x8(%rbx),%rcx + .byte 76,57,233 // cmp %r13,%rcx + .byte 118,5 // jbe 2d <_sk_start_pipeline_ssse3_lowp+0x2d> + .byte 72,137,218 // mov %rbx,%rdx + .byte 235,66 // jmp 6f <_sk_start_pipeline_ssse3_lowp+0x6f> + .byte 65,184,0,0,0,0 // mov $0x0,%r8d + .byte 15,87,192 // xorps %xmm0,%xmm0 + .byte 15,87,201 // xorps %xmm1,%xmm1 + .byte 15,87,210 // xorps %xmm2,%xmm2 + .byte 15,87,219 // xorps %xmm3,%xmm3 + .byte 15,87,228 // xorps %xmm4,%xmm4 + .byte 15,87,237 // xorps %xmm5,%xmm5 + .byte 15,87,246 // xorps %xmm6,%xmm6 + .byte 15,87,255 // xorps %xmm7,%xmm7 + .byte 76,137,247 // mov %r14,%rdi + .byte 76,137,230 // mov %r12,%rsi + .byte 72,137,218 // mov %rbx,%rdx + .byte 76,137,249 // mov %r15,%rcx + .byte 72,137,197 // mov %rax,%rbp + .byte 255,213 // callq *%rbp + .byte 72,137,232 // mov %rbp,%rax + .byte 72,141,83,8 // lea 0x8(%rbx),%rdx + .byte 72,131,195,16 // add $0x10,%rbx + .byte 76,57,235 // cmp %r13,%rbx + .byte 72,137,211 // mov %rdx,%rbx + .byte 118,190 // jbe 2d <_sk_start_pipeline_ssse3_lowp+0x2d> + .byte 73,41,213 // sub %rdx,%r13 + .byte 116,52 // je a8 <_sk_start_pipeline_ssse3_lowp+0xa8> + .byte 15,87,192 // xorps %xmm0,%xmm0 + .byte 15,87,201 // xorps %xmm1,%xmm1 + .byte 15,87,210 // xorps %xmm2,%xmm2 + .byte 15,87,219 // xorps %xmm3,%xmm3 + .byte 15,87,228 // xorps %xmm4,%xmm4 + .byte 15,87,237 // xorps %xmm5,%xmm5 + .byte 15,87,246 // xorps %xmm6,%xmm6 + .byte 15,87,255 // xorps %xmm7,%xmm7 + .byte 76,137,247 // mov %r14,%rdi + .byte 76,137,230 // mov %r12,%rsi + .byte 76,137,249 // mov %r15,%rcx + .byte 77,137,232 // mov %r13,%r8 + .byte 72,131,196,8 // add $0x8,%rsp + .byte 91 // pop %rbx + .byte 65,92 // pop %r12 + .byte 65,93 // pop %r13 + .byte 65,94 // pop %r14 + .byte 65,95 // pop %r15 + .byte 93 // pop %rbp + .byte 255,224 // jmpq *%rax + .byte 72,131,196,8 // add $0x8,%rsp + .byte 91 // pop %rbx + .byte 65,92 // pop %r12 + .byte 65,93 // pop %r13 + .byte 65,94 // pop %r14 + .byte 65,95 // pop %r15 + .byte 93 // pop %rbp + .byte 195 // retq + +HIDDEN _sk_just_return_ssse3_lowp +.globl _sk_just_return_ssse3_lowp +FUNCTION(_sk_just_return_ssse3_lowp) +_sk_just_return_ssse3_lowp: + .byte 195 // retq + +HIDDEN _sk_load_8888_ssse3_lowp +.globl _sk_load_8888_ssse3_lowp +FUNCTION(_sk_load_8888_ssse3_lowp) +_sk_load_8888_ssse3_lowp: + .byte 72,173 // lods %ds:(%rsi),%rax + .byte 76,139,24 // mov (%rax),%r11 + .byte 77,133,192 // test %r8,%r8 + .byte 117,120 // jne 13a <_sk_load_8888_ssse3_lowp+0x82> + .byte 65,15,16,76,147,16 // movups 0x10(%r11,%rdx,4),%xmm1 + .byte 65,15,16,4,147 // movups (%r11,%rdx,4),%xmm0 + .byte 15,40,208 // movaps %xmm0,%xmm2 + .byte 102,15,96,209 // punpcklbw %xmm1,%xmm2 + .byte 102,15,104,193 // punpckhbw %xmm1,%xmm0 + .byte 102,68,15,111,194 // movdqa %xmm2,%xmm8 + .byte 102,68,15,96,192 // punpcklbw %xmm0,%xmm8 + .byte 102,15,104,208 // punpckhbw %xmm0,%xmm2 + .byte 102,69,15,111,200 // movdqa %xmm8,%xmm9 + .byte 102,68,15,96,202 // punpcklbw %xmm2,%xmm9 + .byte 102,68,15,104,194 // punpckhbw %xmm2,%xmm8 + .byte 102,15,239,219 // pxor %xmm3,%xmm3 + .byte 102,15,239,192 // pxor %xmm0,%xmm0 + .byte 102,65,15,96,193 // punpcklbw %xmm9,%xmm0 + .byte 102,15,239,201 // pxor %xmm1,%xmm1 + .byte 102,65,15,104,201 // punpckhbw %xmm9,%xmm1 + .byte 102,15,239,210 // pxor %xmm2,%xmm2 + .byte 102,65,15,96,208 // punpcklbw %xmm8,%xmm2 + .byte 102,65,15,104,216 // punpckhbw %xmm8,%xmm3 + .byte 102,68,15,111,5,238,1,0,0 // movdqa 0x1ee(%rip),%xmm8 # 310 <_sk_swap_rb_ssse3_lowp+0x1c> + .byte 102,65,15,228,192 // pmulhuw %xmm8,%xmm0 + .byte 102,65,15,228,200 // pmulhuw %xmm8,%xmm1 + .byte 102,65,15,228,208 // pmulhuw %xmm8,%xmm2 + .byte 102,65,15,228,216 // pmulhuw %xmm8,%xmm3 + .byte 72,173 // lods %ds:(%rsi),%rax + .byte 255,224 // jmpq *%rax + .byte 69,137,193 // mov %r8d,%r9d + .byte 65,128,225,7 // and $0x7,%r9b + .byte 102,15,239,201 // pxor %xmm1,%xmm1 + .byte 102,15,239,192 // pxor %xmm0,%xmm0 + .byte 65,254,201 // dec %r9b + .byte 65,128,249,6 // cmp $0x6,%r9b + .byte 15,135,119,255,255,255 // ja cd <_sk_load_8888_ssse3_lowp+0x15> + .byte 69,15,182,201 // movzbl %r9b,%r9d + .byte 76,141,21,119,0,0,0 // lea 0x77(%rip),%r10 # 1d8 <_sk_load_8888_ssse3_lowp+0x120> + .byte 75,99,4,138 // movslq (%r10,%r9,4),%rax + .byte 76,1,208 // add %r10,%rax + .byte 255,224 // jmpq *%rax + .byte 102,65,15,110,68,147,24 // movd 0x18(%r11,%rdx,4),%xmm0 + .byte 102,15,112,200,69 // pshufd $0x45,%xmm0,%xmm1 + .byte 102,15,239,192 // pxor %xmm0,%xmm0 + .byte 243,65,15,16,84,147,20 // movss 0x14(%r11,%rdx,4),%xmm2 + .byte 15,198,209,0 // shufps $0x0,%xmm1,%xmm2 + .byte 15,198,209,226 // shufps $0xe2,%xmm1,%xmm2 + .byte 15,40,202 // movaps %xmm2,%xmm1 + .byte 243,65,15,16,84,147,16 // movss 0x10(%r11,%rdx,4),%xmm2 + .byte 243,15,16,202 // movss %xmm2,%xmm1 + .byte 243,65,15,16,84,147,12 // movss 0xc(%r11,%rdx,4),%xmm2 + .byte 15,198,208,32 // shufps $0x20,%xmm0,%xmm2 + .byte 15,198,194,36 // shufps $0x24,%xmm2,%xmm0 + .byte 243,65,15,16,84,147,8 // movss 0x8(%r11,%rdx,4),%xmm2 + .byte 15,198,208,48 // shufps $0x30,%xmm0,%xmm2 + .byte 15,198,194,132 // shufps $0x84,%xmm2,%xmm0 + .byte 243,65,15,16,84,147,4 // movss 0x4(%r11,%rdx,4),%xmm2 + .byte 15,198,208,0 // shufps $0x0,%xmm0,%xmm2 + .byte 15,198,208,226 // shufps $0xe2,%xmm0,%xmm2 + .byte 15,40,194 // movaps %xmm2,%xmm0 + .byte 243,65,15,16,20,147 // movss (%r11,%rdx,4),%xmm2 + .byte 243,15,16,194 // movss %xmm2,%xmm0 + .byte 233,247,254,255,255 // jmpq cd <_sk_load_8888_ssse3_lowp+0x15> + .byte 102,144 // xchg %ax,%ax + .byte 239 // out %eax,(%dx) + .byte 255 // (bad) + .byte 255 // (bad) + .byte 255 // (bad) + .byte 221,255 // (bad) + .byte 255 // (bad) + .byte 255,206 // dec %esi + .byte 255 // (bad) + .byte 255 // (bad) + .byte 255 // (bad) + .byte 191,255,255,255,180 // mov $0xb4ffffff,%edi + .byte 255 // (bad) + .byte 255 // (bad) + .byte 255,162,255,255,255,146 // jmpq *-0x6d000001(%rdx) + .byte 255 // (bad) + .byte 255 // (bad) + .byte 255 // .byte 0xff + +HIDDEN _sk_store_8888_ssse3_lowp +.globl _sk_store_8888_ssse3_lowp +FUNCTION(_sk_store_8888_ssse3_lowp) +_sk_store_8888_ssse3_lowp: + .byte 72,173 // lods %ds:(%rsi),%rax + .byte 76,139,24 // mov (%rax),%r11 + .byte 102,68,15,111,208 // movdqa %xmm0,%xmm10 + .byte 102,65,15,113,210,7 // psrlw $0x7,%xmm10 + .byte 102,68,15,111,194 // movdqa %xmm2,%xmm8 + .byte 102,65,15,113,208,7 // psrlw $0x7,%xmm8 + .byte 102,69,15,103,208 // packuswb %xmm8,%xmm10 + .byte 102,68,15,111,193 // movdqa %xmm1,%xmm8 + .byte 102,65,15,113,208,7 // psrlw $0x7,%xmm8 + .byte 102,68,15,111,203 // movdqa %xmm3,%xmm9 + .byte 102,65,15,113,209,7 // psrlw $0x7,%xmm9 + .byte 102,69,15,103,193 // packuswb %xmm9,%xmm8 + .byte 102,69,15,111,202 // movdqa %xmm10,%xmm9 + .byte 102,69,15,96,200 // punpcklbw %xmm8,%xmm9 + .byte 102,69,15,104,208 // punpckhbw %xmm8,%xmm10 + .byte 102,69,15,111,193 // movdqa %xmm9,%xmm8 + .byte 102,69,15,97,194 // punpcklwd %xmm10,%xmm8 + .byte 102,69,15,105,202 // punpckhwd %xmm10,%xmm9 + .byte 77,133,192 // test %r8,%r8 + .byte 117,17 // jne 263 <_sk_store_8888_ssse3_lowp+0x6f> + .byte 243,69,15,127,76,147,16 // movdqu %xmm9,0x10(%r11,%rdx,4) + .byte 243,69,15,127,4,147 // movdqu %xmm8,(%r11,%rdx,4) + .byte 72,173 // lods %ds:(%rsi),%rax + .byte 255,224 // jmpq *%rax + .byte 69,137,193 // mov %r8d,%r9d + .byte 65,128,225,7 // and $0x7,%r9b + .byte 65,254,201 // dec %r9b + .byte 65,128,249,6 // cmp $0x6,%r9b + .byte 119,236 // ja 25f <_sk_store_8888_ssse3_lowp+0x6b> + .byte 69,15,182,201 // movzbl %r9b,%r9d + .byte 76,141,21,90,0,0,0 // lea 0x5a(%rip),%r10 # 2d8 <_sk_store_8888_ssse3_lowp+0xe4> + .byte 75,99,4,138 // movslq (%r10,%r9,4),%rax + .byte 76,1,208 // add %r10,%rax + .byte 255,224 // jmpq *%rax + .byte 102,69,15,112,209,78 // pshufd $0x4e,%xmm9,%xmm10 + .byte 102,69,15,126,84,147,24 // movd %xmm10,0x18(%r11,%rdx,4) + .byte 102,69,15,112,209,229 // pshufd $0xe5,%xmm9,%xmm10 + .byte 102,69,15,126,84,147,20 // movd %xmm10,0x14(%r11,%rdx,4) + .byte 102,69,15,126,76,147,16 // movd %xmm9,0x10(%r11,%rdx,4) + .byte 102,69,15,112,200,231 // pshufd $0xe7,%xmm8,%xmm9 + .byte 102,69,15,126,76,147,12 // movd %xmm9,0xc(%r11,%rdx,4) + .byte 102,69,15,112,200,78 // pshufd $0x4e,%xmm8,%xmm9 + .byte 102,69,15,126,76,147,8 // movd %xmm9,0x8(%r11,%rdx,4) + .byte 102,69,15,112,200,229 // pshufd $0xe5,%xmm8,%xmm9 + .byte 102,69,15,126,76,147,4 // movd %xmm9,0x4(%r11,%rdx,4) + .byte 102,69,15,126,4,147 // movd %xmm8,(%r11,%rdx,4) + .byte 235,136 // jmp 25f <_sk_store_8888_ssse3_lowp+0x6b> + .byte 144 // nop + .byte 247,255 // idiv %edi + .byte 255 // (bad) + .byte 255 // (bad) + .byte 234 // (bad) + .byte 255 // (bad) + .byte 255 // (bad) + .byte 255 // (bad) + .byte 221,255 // (bad) + .byte 255 // (bad) + .byte 255,208 // callq *%rax + .byte 255 // (bad) + .byte 255 // (bad) + .byte 255,201 // dec %ecx + .byte 255 // (bad) + .byte 255 // (bad) + .byte 255 // (bad) + .byte 188,255,255,255,175 // mov $0xafffffff,%esp + .byte 255 // (bad) + .byte 255 // (bad) + .byte 255 // .byte 0xff + +HIDDEN _sk_swap_rb_ssse3_lowp +.globl _sk_swap_rb_ssse3_lowp +FUNCTION(_sk_swap_rb_ssse3_lowp) +_sk_swap_rb_ssse3_lowp: + .byte 68,15,40,192 // movaps %xmm0,%xmm8 + .byte 72,173 // lods %ds:(%rsi),%rax + .byte 15,40,194 // movaps %xmm2,%xmm0 + .byte 65,15,40,208 // movaps %xmm8,%xmm2 + .byte 255,224 // jmpq *%rax + +BALIGN16 + .byte 129,128,129,128,129,128,129,128,129,128// addl $0x80818081,-0x7f7e7f7f(%rax) + .byte 129 // .byte 0x81 + .byte 128 // .byte 0x80 + .byte 129 // .byte 0x81 + .byte 128 // .byte 0x80 + .byte 129 // .byte 0x81 + .byte 128 // .byte 0x80 #endif diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S index 33d209ea38..ebdd645410 100644 --- a/src/jumper/SkJumper_generated_win.S +++ b/src/jumper/SkJumper_generated_win.S @@ -26262,5 +26262,274 @@ ALIGN 16 DB 170 ; stos %al,%es:(%rdi) DB 170 ; stos %al,%es:(%rdi) DB 190,171,170,170,190 ; mov $0xbeaaaaab,%esi +ALIGN 32 + +PUBLIC _sk_start_pipeline_ssse3_lowp +_sk_start_pipeline_ssse3_lowp LABEL PROC + DB 65,87 ; push %r15 + DB 65,86 ; push %r14 + DB 65,85 ; push %r13 + DB 65,84 ; push %r12 + DB 86 ; push %rsi + DB 87 ; push %rdi + DB 85 ; push %rbp + DB 83 ; push %rbx + DB 72,129,236,168,0,0,0 ; sub $0xa8,%rsp + DB 68,15,41,188,36,144,0,0,0 ; movaps %xmm15,0x90(%rsp) + DB 68,15,41,180,36,128,0,0,0 ; movaps %xmm14,0x80(%rsp) + DB 68,15,41,108,36,112 ; movaps %xmm13,0x70(%rsp) + DB 68,15,41,100,36,96 ; movaps %xmm12,0x60(%rsp) + DB 68,15,41,92,36,80 ; movaps %xmm11,0x50(%rsp) + DB 68,15,41,84,36,64 ; movaps %xmm10,0x40(%rsp) + DB 68,15,41,76,36,48 ; movaps %xmm9,0x30(%rsp) + DB 68,15,41,68,36,32 ; movaps %xmm8,0x20(%rsp) + DB 15,41,124,36,16 ; movaps %xmm7,0x10(%rsp) + DB 15,41,52,36 ; movaps %xmm6,(%rsp) + DB 76,137,195 ; mov %r8,%rbx + DB 73,137,214 ; mov %rdx,%r14 + DB 72,137,205 ; mov %rcx,%rbp + DB 76,139,188,36,16,1,0,0 ; mov 0x110(%rsp),%r15 + DB 76,137,206 ; mov %r9,%rsi + DB 72,173 ; lods %ds:(%rsi),%rax + DB 73,137,196 ; mov %rax,%r12 + DB 73,137,245 ; mov %rsi,%r13 + DB 72,141,69,8 ; lea 0x8(%rbp),%rax + DB 72,57,216 ; cmp %rbx,%rax + DB 118,5 ; jbe 7c <_sk_start_pipeline_ssse3_lowp+0x7c> + DB 72,137,234 ; mov %rbp,%rdx + DB 235,61 ; jmp b9 <_sk_start_pipeline_ssse3_lowp+0xb9> + DB 65,184,0,0,0,0 ; mov $0x0,%r8d + DB 15,87,192 ; xorps %xmm0,%xmm0 + DB 15,87,201 ; xorps %xmm1,%xmm1 + DB 15,87,210 ; xorps %xmm2,%xmm2 + DB 15,87,219 ; xorps %xmm3,%xmm3 + DB 15,87,228 ; xorps %xmm4,%xmm4 + DB 15,87,237 ; xorps %xmm5,%xmm5 + DB 15,87,246 ; xorps %xmm6,%xmm6 + DB 15,87,255 ; xorps %xmm7,%xmm7 + DB 76,137,255 ; mov %r15,%rdi + DB 76,137,238 ; mov %r13,%rsi + DB 72,137,234 ; mov %rbp,%rdx + DB 76,137,241 ; mov %r14,%rcx + DB 65,255,212 ; callq *%r12 + DB 72,141,85,8 ; lea 0x8(%rbp),%rdx + DB 72,131,197,16 ; add $0x10,%rbp + DB 72,57,221 ; cmp %rbx,%rbp + DB 72,137,213 ; mov %rdx,%rbp + DB 118,195 ; jbe 7c <_sk_start_pipeline_ssse3_lowp+0x7c> + DB 72,41,211 ; sub %rdx,%rbx + DB 116,39 ; je e5 <_sk_start_pipeline_ssse3_lowp+0xe5> + DB 15,87,192 ; xorps %xmm0,%xmm0 + DB 15,87,201 ; xorps %xmm1,%xmm1 + DB 15,87,210 ; xorps %xmm2,%xmm2 + DB 15,87,219 ; xorps %xmm3,%xmm3 + DB 15,87,228 ; xorps %xmm4,%xmm4 + DB 15,87,237 ; xorps %xmm5,%xmm5 + DB 15,87,246 ; xorps %xmm6,%xmm6 + DB 15,87,255 ; xorps %xmm7,%xmm7 + DB 76,137,255 ; mov %r15,%rdi + DB 76,137,238 ; mov %r13,%rsi + DB 76,137,241 ; mov %r14,%rcx + DB 73,137,216 ; mov %rbx,%r8 + DB 65,255,212 ; callq *%r12 + DB 15,40,52,36 ; movaps (%rsp),%xmm6 + DB 15,40,124,36,16 ; movaps 0x10(%rsp),%xmm7 + DB 68,15,40,68,36,32 ; movaps 0x20(%rsp),%xmm8 + DB 68,15,40,76,36,48 ; movaps 0x30(%rsp),%xmm9 + DB 68,15,40,84,36,64 ; movaps 0x40(%rsp),%xmm10 + DB 68,15,40,92,36,80 ; movaps 0x50(%rsp),%xmm11 + DB 68,15,40,100,36,96 ; movaps 0x60(%rsp),%xmm12 + DB 68,15,40,108,36,112 ; movaps 0x70(%rsp),%xmm13 + DB 68,15,40,180,36,128,0,0,0 ; movaps 0x80(%rsp),%xmm14 + DB 68,15,40,188,36,144,0,0,0 ; movaps 0x90(%rsp),%xmm15 + DB 72,129,196,168,0,0,0 ; add $0xa8,%rsp + DB 91 ; pop %rbx + DB 93 ; pop %rbp + DB 95 ; pop %rdi + DB 94 ; pop %rsi + DB 65,92 ; pop %r12 + DB 65,93 ; pop %r13 + DB 65,94 ; pop %r14 + DB 65,95 ; pop %r15 + DB 195 ; retq + +PUBLIC _sk_just_return_ssse3_lowp +_sk_just_return_ssse3_lowp LABEL PROC + DB 195 ; retq + +PUBLIC _sk_load_8888_ssse3_lowp +_sk_load_8888_ssse3_lowp LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 76,139,24 ; mov (%rax),%r11 + DB 77,133,192 ; test %r8,%r8 + DB 117,120 ; jne 1bb <_sk_load_8888_ssse3_lowp+0x82> + DB 65,15,16,76,147,16 ; movups 0x10(%r11,%rdx,4),%xmm1 + DB 65,15,16,4,147 ; movups (%r11,%rdx,4),%xmm0 + DB 15,40,208 ; movaps %xmm0,%xmm2 + DB 102,15,96,209 ; punpcklbw %xmm1,%xmm2 + DB 102,15,104,193 ; punpckhbw %xmm1,%xmm0 + DB 102,68,15,111,194 ; movdqa %xmm2,%xmm8 + DB 102,68,15,96,192 ; punpcklbw %xmm0,%xmm8 + DB 102,15,104,208 ; punpckhbw %xmm0,%xmm2 + DB 102,69,15,111,200 ; movdqa %xmm8,%xmm9 + DB 102,68,15,96,202 ; punpcklbw %xmm2,%xmm9 + DB 102,68,15,104,194 ; punpckhbw %xmm2,%xmm8 + DB 102,15,239,219 ; pxor %xmm3,%xmm3 + DB 102,15,239,192 ; pxor %xmm0,%xmm0 + DB 102,65,15,96,193 ; punpcklbw %xmm9,%xmm0 + DB 102,15,239,201 ; pxor %xmm1,%xmm1 + DB 102,65,15,104,201 ; punpckhbw %xmm9,%xmm1 + DB 102,15,239,210 ; pxor %xmm2,%xmm2 + DB 102,65,15,96,208 ; punpcklbw %xmm8,%xmm2 + DB 102,65,15,104,216 ; punpckhbw %xmm8,%xmm3 + DB 102,68,15,111,5,237,1,0,0 ; movdqa 0x1ed(%rip),%xmm8 # 390 <_sk_swap_rb_ssse3_lowp+0x1c> + DB 102,65,15,228,192 ; pmulhuw %xmm8,%xmm0 + DB 102,65,15,228,200 ; pmulhuw %xmm8,%xmm1 + DB 102,65,15,228,208 ; pmulhuw %xmm8,%xmm2 + DB 102,65,15,228,216 ; pmulhuw %xmm8,%xmm3 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + DB 69,137,193 ; mov %r8d,%r9d + DB 65,128,225,7 ; and $0x7,%r9b + DB 102,15,239,201 ; pxor %xmm1,%xmm1 + DB 102,15,239,192 ; pxor %xmm0,%xmm0 + DB 65,254,201 ; dec %r9b + DB 65,128,249,6 ; cmp $0x6,%r9b + DB 15,135,119,255,255,255 ; ja 14e <_sk_load_8888_ssse3_lowp+0x15> + DB 69,15,182,201 ; movzbl %r9b,%r9d + DB 76,141,21,118,0,0,0 ; lea 0x76(%rip),%r10 # 258 <_sk_load_8888_ssse3_lowp+0x11f> + DB 75,99,4,138 ; movslq (%r10,%r9,4),%rax + DB 76,1,208 ; add %r10,%rax + DB 255,224 ; jmpq *%rax + DB 102,65,15,110,68,147,24 ; movd 0x18(%r11,%rdx,4),%xmm0 + DB 102,15,112,200,69 ; pshufd $0x45,%xmm0,%xmm1 + DB 102,15,239,192 ; pxor %xmm0,%xmm0 + DB 243,65,15,16,84,147,20 ; movss 0x14(%r11,%rdx,4),%xmm2 + DB 15,198,209,0 ; shufps $0x0,%xmm1,%xmm2 + DB 15,198,209,226 ; shufps $0xe2,%xmm1,%xmm2 + DB 15,40,202 ; movaps %xmm2,%xmm1 + DB 243,65,15,16,84,147,16 ; movss 0x10(%r11,%rdx,4),%xmm2 + DB 243,15,16,202 ; movss %xmm2,%xmm1 + DB 243,65,15,16,84,147,12 ; movss 0xc(%r11,%rdx,4),%xmm2 + DB 15,198,208,32 ; shufps $0x20,%xmm0,%xmm2 + DB 15,198,194,36 ; shufps $0x24,%xmm2,%xmm0 + DB 243,65,15,16,84,147,8 ; movss 0x8(%r11,%rdx,4),%xmm2 + DB 15,198,208,48 ; shufps $0x30,%xmm0,%xmm2 + DB 15,198,194,132 ; shufps $0x84,%xmm2,%xmm0 + DB 243,65,15,16,84,147,4 ; movss 0x4(%r11,%rdx,4),%xmm2 + DB 15,198,208,0 ; shufps $0x0,%xmm0,%xmm2 + DB 15,198,208,226 ; shufps $0xe2,%xmm0,%xmm2 + DB 15,40,194 ; movaps %xmm2,%xmm0 + DB 243,65,15,16,20,147 ; movss (%r11,%rdx,4),%xmm2 + DB 243,15,16,194 ; movss %xmm2,%xmm0 + DB 233,247,254,255,255 ; jmpq 14e <_sk_load_8888_ssse3_lowp+0x15> + DB 144 ; nop + DB 240,255 ; lock (bad) + DB 255 ; (bad) + DB 255 ; (bad) + DB 222,255 ; fdivrp %st,%st(7) + DB 255 ; (bad) + DB 255,207 ; dec %edi + DB 255 ; (bad) + DB 255 ; (bad) + DB 255,192 ; inc %eax + DB 255 ; (bad) + DB 255 ; (bad) + DB 255,181,255,255,255,163 ; pushq -0x5c000001(%rbp) + DB 255 ; (bad) + DB 255 ; (bad) + DB 255 ; .byte 0xff + DB 147 ; xchg %eax,%ebx + DB 255 ; (bad) + DB 255 ; (bad) + DB 255 ; .byte 0xff + +PUBLIC _sk_store_8888_ssse3_lowp +_sk_store_8888_ssse3_lowp LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 76,139,24 ; mov (%rax),%r11 + DB 102,68,15,111,208 ; movdqa %xmm0,%xmm10 + DB 102,65,15,113,210,7 ; psrlw $0x7,%xmm10 + DB 102,68,15,111,194 ; movdqa %xmm2,%xmm8 + DB 102,65,15,113,208,7 ; psrlw $0x7,%xmm8 + DB 102,69,15,103,208 ; packuswb %xmm8,%xmm10 + DB 102,68,15,111,193 ; movdqa %xmm1,%xmm8 + DB 102,65,15,113,208,7 ; psrlw $0x7,%xmm8 + DB 102,68,15,111,203 ; movdqa %xmm3,%xmm9 + DB 102,65,15,113,209,7 ; psrlw $0x7,%xmm9 + DB 102,69,15,103,193 ; packuswb %xmm9,%xmm8 + DB 102,69,15,111,202 ; movdqa %xmm10,%xmm9 + DB 102,69,15,96,200 ; punpcklbw %xmm8,%xmm9 + DB 102,69,15,104,208 ; punpckhbw %xmm8,%xmm10 + DB 102,69,15,111,193 ; movdqa %xmm9,%xmm8 + DB 102,69,15,97,194 ; punpcklwd %xmm10,%xmm8 + DB 102,69,15,105,202 ; punpckhwd %xmm10,%xmm9 + DB 77,133,192 ; test %r8,%r8 + DB 117,17 ; jne 2e3 <_sk_store_8888_ssse3_lowp+0x6f> + DB 243,69,15,127,76,147,16 ; movdqu %xmm9,0x10(%r11,%rdx,4) + DB 243,69,15,127,4,147 ; movdqu %xmm8,(%r11,%rdx,4) + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + DB 69,137,193 ; mov %r8d,%r9d + DB 65,128,225,7 ; and $0x7,%r9b + DB 65,254,201 ; dec %r9b + DB 65,128,249,6 ; cmp $0x6,%r9b + DB 119,236 ; ja 2df <_sk_store_8888_ssse3_lowp+0x6b> + DB 69,15,182,201 ; movzbl %r9b,%r9d + DB 76,141,21,90,0,0,0 ; lea 0x5a(%rip),%r10 # 358 <_sk_store_8888_ssse3_lowp+0xe4> + DB 75,99,4,138 ; movslq (%r10,%r9,4),%rax + DB 76,1,208 ; add %r10,%rax + DB 255,224 ; jmpq *%rax + DB 102,69,15,112,209,78 ; pshufd $0x4e,%xmm9,%xmm10 + DB 102,69,15,126,84,147,24 ; movd %xmm10,0x18(%r11,%rdx,4) + DB 102,69,15,112,209,229 ; pshufd $0xe5,%xmm9,%xmm10 + DB 102,69,15,126,84,147,20 ; movd %xmm10,0x14(%r11,%rdx,4) + DB 102,69,15,126,76,147,16 ; movd %xmm9,0x10(%r11,%rdx,4) + DB 102,69,15,112,200,231 ; pshufd $0xe7,%xmm8,%xmm9 + DB 102,69,15,126,76,147,12 ; movd %xmm9,0xc(%r11,%rdx,4) + DB 102,69,15,112,200,78 ; pshufd $0x4e,%xmm8,%xmm9 + DB 102,69,15,126,76,147,8 ; movd %xmm9,0x8(%r11,%rdx,4) + DB 102,69,15,112,200,229 ; pshufd $0xe5,%xmm8,%xmm9 + DB 102,69,15,126,76,147,4 ; movd %xmm9,0x4(%r11,%rdx,4) + DB 102,69,15,126,4,147 ; movd %xmm8,(%r11,%rdx,4) + DB 235,136 ; jmp 2df <_sk_store_8888_ssse3_lowp+0x6b> + DB 144 ; nop + DB 247,255 ; idiv %edi + DB 255 ; (bad) + DB 255 ; (bad) + DB 234 ; (bad) + DB 255 ; (bad) + DB 255 ; (bad) + DB 255 ; (bad) + DB 221,255 ; (bad) + DB 255 ; (bad) + DB 255,208 ; callq *%rax + DB 255 ; (bad) + DB 255 ; (bad) + DB 255,201 ; dec %ecx + DB 255 ; (bad) + DB 255 ; (bad) + DB 255 ; (bad) + DB 188,255,255,255,175 ; mov $0xafffffff,%esp + DB 255 ; (bad) + DB 255 ; (bad) + DB 255 ; .byte 0xff + +PUBLIC _sk_swap_rb_ssse3_lowp +_sk_swap_rb_ssse3_lowp LABEL PROC + DB 68,15,40,192 ; movaps %xmm0,%xmm8 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 15,40,194 ; movaps %xmm2,%xmm0 + DB 65,15,40,208 ; movaps %xmm8,%xmm2 + DB 255,224 ; jmpq *%rax + +ALIGN 16 + DB 129,128,129,128,129,128,129,128,129,128; addl $0x80818081,-0x7f7e7f7f(%rax) + DB 129 ; .byte 0x81 + DB 128 ; .byte 0x80 + DB 129 ; .byte 0x81 + DB 128 ; .byte 0x80 + DB 129 ; .byte 0x81 + DB 128 ; .byte 0x80 ENDIF END diff --git a/src/jumper/SkJumper_stages_lowp.cpp b/src/jumper/SkJumper_stages_lowp.cpp new file mode 100644 index 0000000000..1abaff76b4 --- /dev/null +++ b/src/jumper/SkJumper_stages_lowp.cpp @@ -0,0 +1,180 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include "SkJumper.h" +#include "SkJumper_misc.h" +#include <immintrin.h> + +#if !defined(__SSSE3__) || !defined(__clang__) || !defined(__x86_64__) + #error "We're starting with just SSSE3 x86-64 for now, and will always require Clang." +#endif + +#define WRAP(name) sk_##name##_ssse3_lowp + +using K = const SkJumper_constants; +static const size_t kStride = 8; + +template <typename T> using V = T __attribute__((ext_vector_type(8))); +using U8 = V<uint8_t>; +using U16 = V<uint16_t>; +using U32 = V<uint32_t>; + +// See SkFixed15.h for details on this format and its operations. +struct F { + U16 vec; + + F() = default; + F(uint16_t bits) : vec(bits) {} + + F(U16 v) : vec(v) {} + operator U16() const { return vec; } +}; + +SI F operator+(F x, F y) { return x.vec + y.vec; } +SI F operator-(F x, F y) { return x.vec - y.vec; } +SI F operator*(F x, F y) { return _mm_abs_epi16(_mm_mulhrs_epi16(x.vec, y.vec)); } +SI F mad(F f, F m, F a) { return f*m+a; } + +SI F operator<<(F x, int bits) { return x.vec << bits; } +SI F operator>>(F x, int bits) { return x.vec >> bits; } + +using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,F, F,F,F,F); + +MAYBE_MSABI +extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t limit, void** program, K* k) { + F v{}; + auto start = (Stage*)load_and_inc(program); + while (x + kStride <= limit) { + start(k,program,x,y,0, v,v,v,v, v,v,v,v); + x += kStride; + } + if (size_t tail = limit - x) { + start(k,program,x,y,tail, v,v,v,v, v,v,v,v); + } +} +extern "C" void WRAP(just_return)(K*, void**, size_t,size_t,size_t, F,F,F,F, F,F,F,F) {} + +#define STAGE(name) \ + SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail, \ + F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \ + extern "C" void WRAP(name)(K* k, void** program, size_t x, size_t y, size_t tail, \ + F r, F g, F b, F a, F dr, F dg, F db, F da) { \ + LazyCtx ctx(program); \ + name##_k(k,ctx,x,y,tail, r,g,b,a, dr,dg,db,da); \ + auto next = (Stage*)load_and_inc(program); \ + next(k,program,x,y,tail, r,g,b,a, dr,dg,db,da); \ + } \ + SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail, \ + F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da) + + +// Helper functions used by multiple stages. + +template <typename V, typename T> +SI V load(const T* src, size_t tail) { +#if defined(JUMPER) + __builtin_assume(tail < kStride); + if (__builtin_expect(tail, 0)) { + V v{}; // Any inactive lanes are zeroed. + switch (tail-1) { + case 6: v[6] = src[6]; + case 5: v[5] = src[5]; + case 4: v[4] = src[4]; + case 3: v[3] = src[3]; + case 2: v[2] = src[2]; + case 1: v[1] = src[1]; + case 0: v[0] = src[0]; + } + return v; + } +#endif + return unaligned_load<V>(src); +} + +template <typename V, typename T> +SI void store(T* dst, V v, size_t tail) { +#if defined(JUMPER) + __builtin_assume(tail < kStride); + if (__builtin_expect(tail, 0)) { + switch (tail-1) { + case 6: dst[6] = v[6]; + case 5: dst[5] = v[5]; + case 4: dst[4] = v[4]; + case 3: dst[3] = v[3]; + case 2: dst[2] = v[2]; + case 1: dst[1] = v[1]; + case 0: dst[0] = v[0]; + } + return; + } +#endif + unaligned_store(dst, v); +} + +SI void from_8888(U32 rgba, F* r, F* g, F* b, F* a) { + // Split the 8 pixels into low and high halves, and reinterpret as vectors of 16-bit values. + U16 lo = unaligned_load<U16>((const uint32_t*)&rgba + 0), + hi = unaligned_load<U16>((const uint32_t*)&rgba + 4); + + U16 _0415 = _mm_unpacklo_epi8(lo, hi), // r0 r4 g0 g4 b0 b4 a0 a4 r1 r5 g1 g5 b1 b5 a1 a5 + _2637 = _mm_unpackhi_epi8(lo, hi); + + U16 even = _mm_unpacklo_epi8(_0415, _2637), // r0 r2 r4 r6 g0 g2 g4 g6 b0 b2 b4 b6 a0 a2 a4 a6 + odd = _mm_unpackhi_epi8(_0415, _2637); + + U16 rg = _mm_unpacklo_epi8(even, odd), // r0 r1 r2 r3 r4 r5 r6 r7 g0 g1 g2 g3 g4 g5 g6 g7 + ba = _mm_unpackhi_epi8(even, odd); + + // Unpack as 16-bit values into the high half of each 16-bit lane, to get a free *256. + U16 R = _mm_unpacklo_epi8(U16(0), rg), + G = _mm_unpackhi_epi8(U16(0), rg), + B = _mm_unpacklo_epi8(U16(0), ba), + A = _mm_unpackhi_epi8(U16(0), ba); + + // Now we scale from [0,255] to [0,32768]. Ideally that's 32768/255 = 128.50196, + // but we can approximate that very cheaply as 256*32897/65536 = 128.50391. + // 0 and 255 map to 0 and 32768 correctly, and nothing else is off by more than 1. + *r = _mm_mulhi_epu16(R, U16(32897)); + *g = _mm_mulhi_epu16(G, U16(32897)); + *b = _mm_mulhi_epu16(B, U16(32897)); + *a = _mm_mulhi_epu16(A, U16(32897)); +} +SI U32 to_8888(F r, F g, F b, F a) { + // We want to interlace and pack these values from [0,32768] to [0,255]. + // Luckily the simplest possible thing works great: >>7, then saturate. + // The 'u' in packus handles the saturation to [0,255] we need. + U16 rb = _mm_packus_epi16(r>>7,b>>7), // r0 r1 r2 r3 r4 r5 r6 r7 b0 b1 b2 b3 b4 b5 b6 b7 + ga = _mm_packus_epi16(g>>7,a>>7); + + U16 rg = _mm_unpacklo_epi8(rb, ga), // r0 g0 r1 g1 ... r7 g7 + ba = _mm_unpackhi_epi8(rb, ga); // b0 a0 ... b7 a7 + + U16 lo = _mm_unpacklo_epi16(rg, ba), // r0 g0 b0 a0 ... r3 g3 b3 a3 + hi = _mm_unpackhi_epi16(rg, ba); // r4 g4 b4 a4 ... r7 g7 b7 a7 + + U32 px; + memcpy((uint32_t*)&px + 0, &lo, sizeof(lo)); + memcpy((uint32_t*)&px + 4, &hi, sizeof(hi)); + return px; +} + +// Stages! + +STAGE(load_8888) { + auto ptr = *(const uint32_t**)ctx + x; + from_8888(load<U32>(ptr, tail), &r,&g,&b,&a); +} +STAGE(store_8888) { + auto ptr = *(uint32_t**)ctx + x; + store(ptr, to_8888(r,g,b,a), tail); +} + +STAGE(swap_rb) { + auto tmp = r; + r = b; + b = tmp; +} diff --git a/src/jumper/build_stages.py b/src/jumper/build_stages.py index 301295fa68..971087b3d5 100755 --- a/src/jumper/build_stages.py +++ b/src/jumper/build_stages.py @@ -29,6 +29,14 @@ subprocess.check_call(clang + cflags + sse2 + win + ['-c', 'src/jumper/SkJumper_stages.cpp'] + ['-o', 'win_sse2.o']) +ssse3 = ['-mssse3', '-mno-sse4.1'] +subprocess.check_call(clang + cflags + ssse3 + + ['-c', 'src/jumper/SkJumper_stages_lowp.cpp'] + + ['-o', 'lowp_ssse3.o']) +subprocess.check_call(clang + cflags + ssse3 + win + + ['-c', 'src/jumper/SkJumper_stages_lowp.cpp'] + + ['-o', 'win_lowp_ssse3.o']) + sse41 = ['-msse4.1'] subprocess.check_call(clang + cflags + sse41 + ['-c', 'src/jumper/SkJumper_stages.cpp'] + @@ -187,6 +195,8 @@ print 'BALIGN32' parse_object_file('sse41.o', '.byte') print 'BALIGN32' parse_object_file('sse2.o', '.byte') +print 'BALIGN32' +parse_object_file('lowp_ssse3.o', '.byte') print '#endif' @@ -210,5 +220,7 @@ print 'ALIGN 32' parse_object_file('win_sse41.o', 'DB') print 'ALIGN 32' parse_object_file('win_sse2.o', 'DB') +print 'ALIGN 32' +parse_object_file('win_lowp_ssse3.o', 'DB') print 'ENDIF' print 'END' |