-rw-r--r--  src/core/SkRasterPipeline.h            6
-rw-r--r--  src/jumper/SkJumper.cpp               74
-rw-r--r--  src/jumper/SkJumper_generated.S      261
-rw-r--r--  src/jumper/SkJumper_generated_win.S  269
-rw-r--r--  src/jumper/SkJumper_stages_lowp.cpp  180
-rwxr-xr-x  src/jumper/build_stages.py            12
-rw-r--r--  tests/SkRasterPipelineTest.cpp        28

7 files changed, 809 insertions, 21 deletions
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index 353039171f..bd54e3f4df 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -16,7 +16,7 @@
#include <functional>
#include <vector>
-struct SkJumper_Engine;
+struct SkJumper_constants;
/**
* SkRasterPipeline provides a cheap way to chain together a pixel processing pipeline.
@@ -149,13 +149,15 @@ public:
bool empty() const { return fStages == nullptr; }
private:
+ using StartPipelineFn = void(size_t,size_t,size_t,void**,const SkJumper_constants*);
+
struct StageList {
StageList* prev;
StockStage stage;
void* ctx;
};
- void build_pipeline(const SkJumper_Engine&, void**) const;
+ StartPipelineFn* build_pipeline(void**) const;
void unchecked_append(StockStage, void*);
SkArenaAlloc* fAlloc;
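With this change, build_pipeline() fills the caller-supplied void** array from back to front and returns the start_pipeline entry point matched to the stages it emitted. As a sketch (stage names here are illustrative, not from this patch), a two-stage load/store pipeline ends up laid out as:

    // program, read front to back after build_pipeline():
    //   [ &load_8888_fn, load_ctx, &store_8888_fn, store_ctx, &just_return_fn ]
    //
    // run() then does, in effect:
    //   StartPipelineFn* start = this->build_pipeline(program + fSlotsNeeded);
    //   start(x, y, x+n, program, &kConstants);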
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index c5840d3522..77a5856027 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -29,6 +29,7 @@ static K kConstants = {
// We can't express the real types of most stage functions portably, so we use a stand-in.
// We'll only ever call start_pipeline(), which then chains into the rest for us.
using StageFn = void(void);
+using StartPipelineFn = void(size_t,size_t,size_t,void**,K*);
// Some platforms expect C "name" maps to asm "_name", others to "name".
#if defined(__APPLE__)
@@ -43,29 +44,31 @@ extern "C" {
// We'll just run portable code.
#elif defined(__aarch64__)
- void ASM(start_pipeline,aarch64)(size_t,size_t,size_t, void**, K*);
+ StartPipelineFn ASM(start_pipeline,aarch64);
StageFn ASM(just_return,aarch64);
#define M(st) StageFn ASM(st,aarch64);
SK_RASTER_PIPELINE_STAGES(M)
#undef M
#elif defined(__arm__)
- void ASM(start_pipeline,vfp4)(size_t,size_t,size_t, void**, K*);
+ StartPipelineFn ASM(start_pipeline,vfp4);
StageFn ASM(just_return,vfp4);
#define M(st) StageFn ASM(st,vfp4);
SK_RASTER_PIPELINE_STAGES(M)
#undef M
#elif defined(__x86_64__) || defined(_M_X64)
- void ASM(start_pipeline,hsw )(size_t,size_t,size_t, void**, K*);
- void ASM(start_pipeline,avx )(size_t,size_t,size_t, void**, K*);
- void ASM(start_pipeline,sse41)(size_t,size_t,size_t, void**, K*);
- void ASM(start_pipeline,sse2 )(size_t,size_t,size_t, void**, K*);
+ StartPipelineFn ASM(start_pipeline,hsw ),
+ ASM(start_pipeline,avx ),
+ ASM(start_pipeline,sse41 ),
+ ASM(start_pipeline,sse2 ),
+ ASM(start_pipeline,ssse3_lowp);
StageFn ASM(just_return,hsw),
ASM(just_return,avx),
ASM(just_return,sse41),
- ASM(just_return,sse2);
+ ASM(just_return,sse2),
+ ASM(just_return,ssse3_lowp);
#define M(st) StageFn ASM(st,hsw);
SK_RASTER_PIPELINE_STAGES(M)
@@ -79,10 +82,14 @@ extern "C" {
#define M(st) StageFn ASM(st,sse2);
SK_RASTER_PIPELINE_STAGES(M)
#undef M
+
+ StageFn ASM(load_8888, ssse3_lowp),
+ ASM(store_8888, ssse3_lowp),
+ ASM(swap_rb, ssse3_lowp);
#endif
// Portable, single-pixel stages.
- void sk_start_pipeline(size_t,size_t,size_t, void**, K*);
+ StartPipelineFn sk_start_pipeline;
StageFn sk_just_return;
#define M(st) StageFn sk_##st;
SK_RASTER_PIPELINE_STAGES(M)
@@ -170,43 +177,72 @@ static SkJumper_Engine choose_engine() {
return kPortable;
}
-void SkRasterPipeline::build_pipeline(const SkJumper_Engine& engine, void** ip) const {
- const StageList* st = fStages;
+StartPipelineFn* SkRasterPipeline::build_pipeline(void** ip) const {
+#if defined(__x86_64__) || defined(_M_X64)
+ if (SkCpu::Supports(SkCpu::SSSE3)) {
+ void** reset_point = ip;
+
+ *--ip = (void*)ASM(just_return,ssse3_lowp);
+ for (const StageList* st = fStages; st; st = st->prev) {
+ StageFn* fn = nullptr;
+ switch (st->stage) {
+ case SkRasterPipeline::load_8888: fn = ASM(load_8888, ssse3_lowp); break;
+ case SkRasterPipeline::store_8888: fn = ASM(store_8888,ssse3_lowp); break;
+ case SkRasterPipeline::swap_rb: fn = ASM(swap_rb, ssse3_lowp); break;
+ default:
+ //SkDebugf("can't %d\n", st->stage);
+ ip = reset_point;
+ }
+ if (ip == reset_point) {
+ break;
+ }
+ if (st->ctx) {
+ *--ip = st->ctx;
+ }
+ *--ip = (void*)fn;
+ }
+
+ if (ip != reset_point) {
+ return ASM(start_pipeline,ssse3_lowp);
+ }
+ }
+#endif
+ gChooseEngineOnce([]{ gEngine = choose_engine(); });
+
// We're building the pipeline backwards, so we start with the final stage just_return.
- *--ip = (void*)engine.just_return;
+ *--ip = (void*)gEngine.just_return;
// Still going backwards, each stage's context pointer then its StageFn.
- for (; st; st = st->prev) {
+ for (const StageList* st = fStages; st; st = st->prev) {
if (st->ctx) {
*--ip = st->ctx;
}
- *--ip = (void*)engine.stages[st->stage];
+ *--ip = (void*)gEngine.stages[st->stage];
}
+ return gEngine.start_pipeline;
}
void SkRasterPipeline::run(size_t x, size_t y, size_t n) const {
if (this->empty()) {
return;
}
- gChooseEngineOnce([]{ gEngine = choose_engine(); });
// Best to not use fAlloc here... we can't bound how often run() will be called.
SkAutoSTMalloc<64, void*> program(fSlotsNeeded);
- this->build_pipeline(gEngine, program.get() + fSlotsNeeded);
- gEngine.start_pipeline(x,y,x+n, program.get(), &kConstants);
+ auto start_pipeline = this->build_pipeline(program.get() + fSlotsNeeded);
+ start_pipeline(x,y,x+n, program.get(), &kConstants);
}
std::function<void(size_t, size_t, size_t)> SkRasterPipeline::compile() const {
if (this->empty()) {
return [](size_t, size_t, size_t) {};
}
- gChooseEngineOnce([]{ gEngine = choose_engine(); });
void** program = fAlloc->makeArray<void*>(fSlotsNeeded);
- this->build_pipeline(gEngine, program + fSlotsNeeded);
+ auto start_pipeline = this->build_pipeline(program + fSlotsNeeded);
return [=](size_t x, size_t y, size_t n) {
- gEngine.start_pipeline(x,y,x+n, program, &kConstants);
+ start_pipeline(x,y,x+n, program, &kConstants);
};
}
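The lowp fast path in build_pipeline() above is a try-then-reset scheme: speculatively emit the 8-bit-per-channel program, and if any stage lacks a lowp implementation, rewind the program pointer and fall back to the float engine. Boiled down, with a hypothetical helper (lowp_lookup() does not exist in this patch; the switch plays its role):

    void** reset_point = ip;                // remember the untouched end of the program
    *--ip = (void*)lowp_just_return;        // speculatively emit the lowp terminator
    for (const StageList* st = fStages; st; st = st->prev) {
        if (StageFn* fn = lowp_lookup(st->stage)) {  // maps a stage to its lowp version
            if (st->ctx) { *--ip = st->ctx; }
            *--ip = (void*)fn;
        } else {
            ip = reset_point;               // unsupported stage: discard everything emitted
            break;
        }
    }
    if (ip != reset_point) {
        return lowp_start_pipeline;         // the whole pipeline fit in lowp
    }
    // ...otherwise fall through and rebuild the program with the float stages.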
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index 8379663481..84eb7a5e13 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -36884,4 +36884,265 @@ BALIGN16
.byte 170 // stos %al,%es:(%rdi)
.byte 170 // stos %al,%es:(%rdi)
.byte 190,171,170,170,190 // mov $0xbeaaaaab,%esi
+BALIGN32
+
+HIDDEN _sk_start_pipeline_ssse3_lowp
+.globl _sk_start_pipeline_ssse3_lowp
+FUNCTION(_sk_start_pipeline_ssse3_lowp)
+_sk_start_pipeline_ssse3_lowp:
+ .byte 85 // push %rbp
+ .byte 65,87 // push %r15
+ .byte 65,86 // push %r14
+ .byte 65,85 // push %r13
+ .byte 65,84 // push %r12
+ .byte 83 // push %rbx
+ .byte 80 // push %rax
+ .byte 77,137,198 // mov %r8,%r14
+ .byte 73,137,213 // mov %rdx,%r13
+ .byte 73,137,247 // mov %rsi,%r15
+ .byte 72,137,251 // mov %rdi,%rbx
+ .byte 72,137,206 // mov %rcx,%rsi
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 73,137,244 // mov %rsi,%r12
+ .byte 72,141,75,8 // lea 0x8(%rbx),%rcx
+ .byte 76,57,233 // cmp %r13,%rcx
+ .byte 118,5 // jbe 2d <_sk_start_pipeline_ssse3_lowp+0x2d>
+ .byte 72,137,218 // mov %rbx,%rdx
+ .byte 235,66 // jmp 6f <_sk_start_pipeline_ssse3_lowp+0x6f>
+ .byte 65,184,0,0,0,0 // mov $0x0,%r8d
+ .byte 15,87,192 // xorps %xmm0,%xmm0
+ .byte 15,87,201 // xorps %xmm1,%xmm1
+ .byte 15,87,210 // xorps %xmm2,%xmm2
+ .byte 15,87,219 // xorps %xmm3,%xmm3
+ .byte 15,87,228 // xorps %xmm4,%xmm4
+ .byte 15,87,237 // xorps %xmm5,%xmm5
+ .byte 15,87,246 // xorps %xmm6,%xmm6
+ .byte 15,87,255 // xorps %xmm7,%xmm7
+ .byte 76,137,247 // mov %r14,%rdi
+ .byte 76,137,230 // mov %r12,%rsi
+ .byte 72,137,218 // mov %rbx,%rdx
+ .byte 76,137,249 // mov %r15,%rcx
+ .byte 72,137,197 // mov %rax,%rbp
+ .byte 255,213 // callq *%rbp
+ .byte 72,137,232 // mov %rbp,%rax
+ .byte 72,141,83,8 // lea 0x8(%rbx),%rdx
+ .byte 72,131,195,16 // add $0x10,%rbx
+ .byte 76,57,235 // cmp %r13,%rbx
+ .byte 72,137,211 // mov %rdx,%rbx
+ .byte 118,190 // jbe 2d <_sk_start_pipeline_ssse3_lowp+0x2d>
+ .byte 73,41,213 // sub %rdx,%r13
+ .byte 116,52 // je a8 <_sk_start_pipeline_ssse3_lowp+0xa8>
+ .byte 15,87,192 // xorps %xmm0,%xmm0
+ .byte 15,87,201 // xorps %xmm1,%xmm1
+ .byte 15,87,210 // xorps %xmm2,%xmm2
+ .byte 15,87,219 // xorps %xmm3,%xmm3
+ .byte 15,87,228 // xorps %xmm4,%xmm4
+ .byte 15,87,237 // xorps %xmm5,%xmm5
+ .byte 15,87,246 // xorps %xmm6,%xmm6
+ .byte 15,87,255 // xorps %xmm7,%xmm7
+ .byte 76,137,247 // mov %r14,%rdi
+ .byte 76,137,230 // mov %r12,%rsi
+ .byte 76,137,249 // mov %r15,%rcx
+ .byte 77,137,232 // mov %r13,%r8
+ .byte 72,131,196,8 // add $0x8,%rsp
+ .byte 91 // pop %rbx
+ .byte 65,92 // pop %r12
+ .byte 65,93 // pop %r13
+ .byte 65,94 // pop %r14
+ .byte 65,95 // pop %r15
+ .byte 93 // pop %rbp
+ .byte 255,224 // jmpq *%rax
+ .byte 72,131,196,8 // add $0x8,%rsp
+ .byte 91 // pop %rbx
+ .byte 65,92 // pop %r12
+ .byte 65,93 // pop %r13
+ .byte 65,94 // pop %r14
+ .byte 65,95 // pop %r15
+ .byte 93 // pop %rbp
+ .byte 195 // retq
+
+HIDDEN _sk_just_return_ssse3_lowp
+.globl _sk_just_return_ssse3_lowp
+FUNCTION(_sk_just_return_ssse3_lowp)
+_sk_just_return_ssse3_lowp:
+ .byte 195 // retq
+
+HIDDEN _sk_load_8888_ssse3_lowp
+.globl _sk_load_8888_ssse3_lowp
+FUNCTION(_sk_load_8888_ssse3_lowp)
+_sk_load_8888_ssse3_lowp:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 76,139,24 // mov (%rax),%r11
+ .byte 77,133,192 // test %r8,%r8
+ .byte 117,120 // jne 13a <_sk_load_8888_ssse3_lowp+0x82>
+ .byte 65,15,16,76,147,16 // movups 0x10(%r11,%rdx,4),%xmm1
+ .byte 65,15,16,4,147 // movups (%r11,%rdx,4),%xmm0
+ .byte 15,40,208 // movaps %xmm0,%xmm2
+ .byte 102,15,96,209 // punpcklbw %xmm1,%xmm2
+ .byte 102,15,104,193 // punpckhbw %xmm1,%xmm0
+ .byte 102,68,15,111,194 // movdqa %xmm2,%xmm8
+ .byte 102,68,15,96,192 // punpcklbw %xmm0,%xmm8
+ .byte 102,15,104,208 // punpckhbw %xmm0,%xmm2
+ .byte 102,69,15,111,200 // movdqa %xmm8,%xmm9
+ .byte 102,68,15,96,202 // punpcklbw %xmm2,%xmm9
+ .byte 102,68,15,104,194 // punpckhbw %xmm2,%xmm8
+ .byte 102,15,239,219 // pxor %xmm3,%xmm3
+ .byte 102,15,239,192 // pxor %xmm0,%xmm0
+ .byte 102,65,15,96,193 // punpcklbw %xmm9,%xmm0
+ .byte 102,15,239,201 // pxor %xmm1,%xmm1
+ .byte 102,65,15,104,201 // punpckhbw %xmm9,%xmm1
+ .byte 102,15,239,210 // pxor %xmm2,%xmm2
+ .byte 102,65,15,96,208 // punpcklbw %xmm8,%xmm2
+ .byte 102,65,15,104,216 // punpckhbw %xmm8,%xmm3
+ .byte 102,68,15,111,5,238,1,0,0 // movdqa 0x1ee(%rip),%xmm8 # 310 <_sk_swap_rb_ssse3_lowp+0x1c>
+ .byte 102,65,15,228,192 // pmulhuw %xmm8,%xmm0
+ .byte 102,65,15,228,200 // pmulhuw %xmm8,%xmm1
+ .byte 102,65,15,228,208 // pmulhuw %xmm8,%xmm2
+ .byte 102,65,15,228,216 // pmulhuw %xmm8,%xmm3
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 69,137,193 // mov %r8d,%r9d
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 102,15,239,201 // pxor %xmm1,%xmm1
+ .byte 102,15,239,192 // pxor %xmm0,%xmm0
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 15,135,119,255,255,255 // ja cd <_sk_load_8888_ssse3_lowp+0x15>
+ .byte 69,15,182,201 // movzbl %r9b,%r9d
+ .byte 76,141,21,119,0,0,0 // lea 0x77(%rip),%r10 # 1d8 <_sk_load_8888_ssse3_lowp+0x120>
+ .byte 75,99,4,138 // movslq (%r10,%r9,4),%rax
+ .byte 76,1,208 // add %r10,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 102,65,15,110,68,147,24 // movd 0x18(%r11,%rdx,4),%xmm0
+ .byte 102,15,112,200,69 // pshufd $0x45,%xmm0,%xmm1
+ .byte 102,15,239,192 // pxor %xmm0,%xmm0
+ .byte 243,65,15,16,84,147,20 // movss 0x14(%r11,%rdx,4),%xmm2
+ .byte 15,198,209,0 // shufps $0x0,%xmm1,%xmm2
+ .byte 15,198,209,226 // shufps $0xe2,%xmm1,%xmm2
+ .byte 15,40,202 // movaps %xmm2,%xmm1
+ .byte 243,65,15,16,84,147,16 // movss 0x10(%r11,%rdx,4),%xmm2
+ .byte 243,15,16,202 // movss %xmm2,%xmm1
+ .byte 243,65,15,16,84,147,12 // movss 0xc(%r11,%rdx,4),%xmm2
+ .byte 15,198,208,32 // shufps $0x20,%xmm0,%xmm2
+ .byte 15,198,194,36 // shufps $0x24,%xmm2,%xmm0
+ .byte 243,65,15,16,84,147,8 // movss 0x8(%r11,%rdx,4),%xmm2
+ .byte 15,198,208,48 // shufps $0x30,%xmm0,%xmm2
+ .byte 15,198,194,132 // shufps $0x84,%xmm2,%xmm0
+ .byte 243,65,15,16,84,147,4 // movss 0x4(%r11,%rdx,4),%xmm2
+ .byte 15,198,208,0 // shufps $0x0,%xmm0,%xmm2
+ .byte 15,198,208,226 // shufps $0xe2,%xmm0,%xmm2
+ .byte 15,40,194 // movaps %xmm2,%xmm0
+ .byte 243,65,15,16,20,147 // movss (%r11,%rdx,4),%xmm2
+ .byte 243,15,16,194 // movss %xmm2,%xmm0
+ .byte 233,247,254,255,255 // jmpq cd <_sk_load_8888_ssse3_lowp+0x15>
+ .byte 102,144 // xchg %ax,%ax
+ .byte 239 // out %eax,(%dx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 221,255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,206 // dec %esi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 191,255,255,255,180 // mov $0xb4ffffff,%edi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,162,255,255,255,146 // jmpq *-0x6d000001(%rdx)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
+
+HIDDEN _sk_store_8888_ssse3_lowp
+.globl _sk_store_8888_ssse3_lowp
+FUNCTION(_sk_store_8888_ssse3_lowp)
+_sk_store_8888_ssse3_lowp:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 76,139,24 // mov (%rax),%r11
+ .byte 102,68,15,111,208 // movdqa %xmm0,%xmm10
+ .byte 102,65,15,113,210,7 // psrlw $0x7,%xmm10
+ .byte 102,68,15,111,194 // movdqa %xmm2,%xmm8
+ .byte 102,65,15,113,208,7 // psrlw $0x7,%xmm8
+ .byte 102,69,15,103,208 // packuswb %xmm8,%xmm10
+ .byte 102,68,15,111,193 // movdqa %xmm1,%xmm8
+ .byte 102,65,15,113,208,7 // psrlw $0x7,%xmm8
+ .byte 102,68,15,111,203 // movdqa %xmm3,%xmm9
+ .byte 102,65,15,113,209,7 // psrlw $0x7,%xmm9
+ .byte 102,69,15,103,193 // packuswb %xmm9,%xmm8
+ .byte 102,69,15,111,202 // movdqa %xmm10,%xmm9
+ .byte 102,69,15,96,200 // punpcklbw %xmm8,%xmm9
+ .byte 102,69,15,104,208 // punpckhbw %xmm8,%xmm10
+ .byte 102,69,15,111,193 // movdqa %xmm9,%xmm8
+ .byte 102,69,15,97,194 // punpcklwd %xmm10,%xmm8
+ .byte 102,69,15,105,202 // punpckhwd %xmm10,%xmm9
+ .byte 77,133,192 // test %r8,%r8
+ .byte 117,17 // jne 263 <_sk_store_8888_ssse3_lowp+0x6f>
+ .byte 243,69,15,127,76,147,16 // movdqu %xmm9,0x10(%r11,%rdx,4)
+ .byte 243,69,15,127,4,147 // movdqu %xmm8,(%r11,%rdx,4)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 69,137,193 // mov %r8d,%r9d
+ .byte 65,128,225,7 // and $0x7,%r9b
+ .byte 65,254,201 // dec %r9b
+ .byte 65,128,249,6 // cmp $0x6,%r9b
+ .byte 119,236 // ja 25f <_sk_store_8888_ssse3_lowp+0x6b>
+ .byte 69,15,182,201 // movzbl %r9b,%r9d
+ .byte 76,141,21,90,0,0,0 // lea 0x5a(%rip),%r10 # 2d8 <_sk_store_8888_ssse3_lowp+0xe4>
+ .byte 75,99,4,138 // movslq (%r10,%r9,4),%rax
+ .byte 76,1,208 // add %r10,%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 102,69,15,112,209,78 // pshufd $0x4e,%xmm9,%xmm10
+ .byte 102,69,15,126,84,147,24 // movd %xmm10,0x18(%r11,%rdx,4)
+ .byte 102,69,15,112,209,229 // pshufd $0xe5,%xmm9,%xmm10
+ .byte 102,69,15,126,84,147,20 // movd %xmm10,0x14(%r11,%rdx,4)
+ .byte 102,69,15,126,76,147,16 // movd %xmm9,0x10(%r11,%rdx,4)
+ .byte 102,69,15,112,200,231 // pshufd $0xe7,%xmm8,%xmm9
+ .byte 102,69,15,126,76,147,12 // movd %xmm9,0xc(%r11,%rdx,4)
+ .byte 102,69,15,112,200,78 // pshufd $0x4e,%xmm8,%xmm9
+ .byte 102,69,15,126,76,147,8 // movd %xmm9,0x8(%r11,%rdx,4)
+ .byte 102,69,15,112,200,229 // pshufd $0xe5,%xmm8,%xmm9
+ .byte 102,69,15,126,76,147,4 // movd %xmm9,0x4(%r11,%rdx,4)
+ .byte 102,69,15,126,4,147 // movd %xmm8,(%r11,%rdx,4)
+ .byte 235,136 // jmp 25f <_sk_store_8888_ssse3_lowp+0x6b>
+ .byte 144 // nop
+ .byte 247,255 // idiv %edi
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 234 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 221,255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,208 // callq *%rax
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255,201 // dec %ecx
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 188,255,255,255,175 // mov $0xafffffff,%esp
+ .byte 255 // (bad)
+ .byte 255 // (bad)
+ .byte 255 // .byte 0xff
+
+HIDDEN _sk_swap_rb_ssse3_lowp
+.globl _sk_swap_rb_ssse3_lowp
+FUNCTION(_sk_swap_rb_ssse3_lowp)
+_sk_swap_rb_ssse3_lowp:
+ .byte 68,15,40,192 // movaps %xmm0,%xmm8
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 15,40,194 // movaps %xmm2,%xmm0
+ .byte 65,15,40,208 // movaps %xmm8,%xmm2
+ .byte 255,224 // jmpq *%rax
+
+BALIGN16
+ .byte 129,128,129,128,129,128,129,128,129,128// addl $0x80818081,-0x7f7e7f7f(%rax)
+ .byte 129 // .byte 0x81
+ .byte 128 // .byte 0x80
+ .byte 129 // .byte 0x81
+ .byte 128 // .byte 0x80
+ .byte 129 // .byte 0x81
+ .byte 128 // .byte 0x80
#endif
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index 33d209ea38..ebdd645410 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -26262,5 +26262,274 @@ ALIGN 16
DB 170 ; stos %al,%es:(%rdi)
DB 170 ; stos %al,%es:(%rdi)
DB 190,171,170,170,190 ; mov $0xbeaaaaab,%esi
+ALIGN 32
+
+PUBLIC _sk_start_pipeline_ssse3_lowp
+_sk_start_pipeline_ssse3_lowp LABEL PROC
+ DB 65,87 ; push %r15
+ DB 65,86 ; push %r14
+ DB 65,85 ; push %r13
+ DB 65,84 ; push %r12
+ DB 86 ; push %rsi
+ DB 87 ; push %rdi
+ DB 85 ; push %rbp
+ DB 83 ; push %rbx
+ DB 72,129,236,168,0,0,0 ; sub $0xa8,%rsp
+ DB 68,15,41,188,36,144,0,0,0 ; movaps %xmm15,0x90(%rsp)
+ DB 68,15,41,180,36,128,0,0,0 ; movaps %xmm14,0x80(%rsp)
+ DB 68,15,41,108,36,112 ; movaps %xmm13,0x70(%rsp)
+ DB 68,15,41,100,36,96 ; movaps %xmm12,0x60(%rsp)
+ DB 68,15,41,92,36,80 ; movaps %xmm11,0x50(%rsp)
+ DB 68,15,41,84,36,64 ; movaps %xmm10,0x40(%rsp)
+ DB 68,15,41,76,36,48 ; movaps %xmm9,0x30(%rsp)
+ DB 68,15,41,68,36,32 ; movaps %xmm8,0x20(%rsp)
+ DB 15,41,124,36,16 ; movaps %xmm7,0x10(%rsp)
+ DB 15,41,52,36 ; movaps %xmm6,(%rsp)
+ DB 76,137,195 ; mov %r8,%rbx
+ DB 73,137,214 ; mov %rdx,%r14
+ DB 72,137,205 ; mov %rcx,%rbp
+ DB 76,139,188,36,16,1,0,0 ; mov 0x110(%rsp),%r15
+ DB 76,137,206 ; mov %r9,%rsi
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 73,137,196 ; mov %rax,%r12
+ DB 73,137,245 ; mov %rsi,%r13
+ DB 72,141,69,8 ; lea 0x8(%rbp),%rax
+ DB 72,57,216 ; cmp %rbx,%rax
+ DB 118,5 ; jbe 7c <_sk_start_pipeline_ssse3_lowp+0x7c>
+ DB 72,137,234 ; mov %rbp,%rdx
+ DB 235,61 ; jmp b9 <_sk_start_pipeline_ssse3_lowp+0xb9>
+ DB 65,184,0,0,0,0 ; mov $0x0,%r8d
+ DB 15,87,192 ; xorps %xmm0,%xmm0
+ DB 15,87,201 ; xorps %xmm1,%xmm1
+ DB 15,87,210 ; xorps %xmm2,%xmm2
+ DB 15,87,219 ; xorps %xmm3,%xmm3
+ DB 15,87,228 ; xorps %xmm4,%xmm4
+ DB 15,87,237 ; xorps %xmm5,%xmm5
+ DB 15,87,246 ; xorps %xmm6,%xmm6
+ DB 15,87,255 ; xorps %xmm7,%xmm7
+ DB 76,137,255 ; mov %r15,%rdi
+ DB 76,137,238 ; mov %r13,%rsi
+ DB 72,137,234 ; mov %rbp,%rdx
+ DB 76,137,241 ; mov %r14,%rcx
+ DB 65,255,212 ; callq *%r12
+ DB 72,141,85,8 ; lea 0x8(%rbp),%rdx
+ DB 72,131,197,16 ; add $0x10,%rbp
+ DB 72,57,221 ; cmp %rbx,%rbp
+ DB 72,137,213 ; mov %rdx,%rbp
+ DB 118,195 ; jbe 7c <_sk_start_pipeline_ssse3_lowp+0x7c>
+ DB 72,41,211 ; sub %rdx,%rbx
+ DB 116,39 ; je e5 <_sk_start_pipeline_ssse3_lowp+0xe5>
+ DB 15,87,192 ; xorps %xmm0,%xmm0
+ DB 15,87,201 ; xorps %xmm1,%xmm1
+ DB 15,87,210 ; xorps %xmm2,%xmm2
+ DB 15,87,219 ; xorps %xmm3,%xmm3
+ DB 15,87,228 ; xorps %xmm4,%xmm4
+ DB 15,87,237 ; xorps %xmm5,%xmm5
+ DB 15,87,246 ; xorps %xmm6,%xmm6
+ DB 15,87,255 ; xorps %xmm7,%xmm7
+ DB 76,137,255 ; mov %r15,%rdi
+ DB 76,137,238 ; mov %r13,%rsi
+ DB 76,137,241 ; mov %r14,%rcx
+ DB 73,137,216 ; mov %rbx,%r8
+ DB 65,255,212 ; callq *%r12
+ DB 15,40,52,36 ; movaps (%rsp),%xmm6
+ DB 15,40,124,36,16 ; movaps 0x10(%rsp),%xmm7
+ DB 68,15,40,68,36,32 ; movaps 0x20(%rsp),%xmm8
+ DB 68,15,40,76,36,48 ; movaps 0x30(%rsp),%xmm9
+ DB 68,15,40,84,36,64 ; movaps 0x40(%rsp),%xmm10
+ DB 68,15,40,92,36,80 ; movaps 0x50(%rsp),%xmm11
+ DB 68,15,40,100,36,96 ; movaps 0x60(%rsp),%xmm12
+ DB 68,15,40,108,36,112 ; movaps 0x70(%rsp),%xmm13
+ DB 68,15,40,180,36,128,0,0,0 ; movaps 0x80(%rsp),%xmm14
+ DB 68,15,40,188,36,144,0,0,0 ; movaps 0x90(%rsp),%xmm15
+ DB 72,129,196,168,0,0,0 ; add $0xa8,%rsp
+ DB 91 ; pop %rbx
+ DB 93 ; pop %rbp
+ DB 95 ; pop %rdi
+ DB 94 ; pop %rsi
+ DB 65,92 ; pop %r12
+ DB 65,93 ; pop %r13
+ DB 65,94 ; pop %r14
+ DB 65,95 ; pop %r15
+ DB 195 ; retq
+
+PUBLIC _sk_just_return_ssse3_lowp
+_sk_just_return_ssse3_lowp LABEL PROC
+ DB 195 ; retq
+
+PUBLIC _sk_load_8888_ssse3_lowp
+_sk_load_8888_ssse3_lowp LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 76,139,24 ; mov (%rax),%r11
+ DB 77,133,192 ; test %r8,%r8
+ DB 117,120 ; jne 1bb <_sk_load_8888_ssse3_lowp+0x82>
+ DB 65,15,16,76,147,16 ; movups 0x10(%r11,%rdx,4),%xmm1
+ DB 65,15,16,4,147 ; movups (%r11,%rdx,4),%xmm0
+ DB 15,40,208 ; movaps %xmm0,%xmm2
+ DB 102,15,96,209 ; punpcklbw %xmm1,%xmm2
+ DB 102,15,104,193 ; punpckhbw %xmm1,%xmm0
+ DB 102,68,15,111,194 ; movdqa %xmm2,%xmm8
+ DB 102,68,15,96,192 ; punpcklbw %xmm0,%xmm8
+ DB 102,15,104,208 ; punpckhbw %xmm0,%xmm2
+ DB 102,69,15,111,200 ; movdqa %xmm8,%xmm9
+ DB 102,68,15,96,202 ; punpcklbw %xmm2,%xmm9
+ DB 102,68,15,104,194 ; punpckhbw %xmm2,%xmm8
+ DB 102,15,239,219 ; pxor %xmm3,%xmm3
+ DB 102,15,239,192 ; pxor %xmm0,%xmm0
+ DB 102,65,15,96,193 ; punpcklbw %xmm9,%xmm0
+ DB 102,15,239,201 ; pxor %xmm1,%xmm1
+ DB 102,65,15,104,201 ; punpckhbw %xmm9,%xmm1
+ DB 102,15,239,210 ; pxor %xmm2,%xmm2
+ DB 102,65,15,96,208 ; punpcklbw %xmm8,%xmm2
+ DB 102,65,15,104,216 ; punpckhbw %xmm8,%xmm3
+ DB 102,68,15,111,5,237,1,0,0 ; movdqa 0x1ed(%rip),%xmm8 # 390 <_sk_swap_rb_ssse3_lowp+0x1c>
+ DB 102,65,15,228,192 ; pmulhuw %xmm8,%xmm0
+ DB 102,65,15,228,200 ; pmulhuw %xmm8,%xmm1
+ DB 102,65,15,228,208 ; pmulhuw %xmm8,%xmm2
+ DB 102,65,15,228,216 ; pmulhuw %xmm8,%xmm3
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+ DB 69,137,193 ; mov %r8d,%r9d
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 102,15,239,201 ; pxor %xmm1,%xmm1
+ DB 102,15,239,192 ; pxor %xmm0,%xmm0
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 15,135,119,255,255,255 ; ja 14e <_sk_load_8888_ssse3_lowp+0x15>
+ DB 69,15,182,201 ; movzbl %r9b,%r9d
+ DB 76,141,21,118,0,0,0 ; lea 0x76(%rip),%r10 # 258 <_sk_load_8888_ssse3_lowp+0x11f>
+ DB 75,99,4,138 ; movslq (%r10,%r9,4),%rax
+ DB 76,1,208 ; add %r10,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 102,65,15,110,68,147,24 ; movd 0x18(%r11,%rdx,4),%xmm0
+ DB 102,15,112,200,69 ; pshufd $0x45,%xmm0,%xmm1
+ DB 102,15,239,192 ; pxor %xmm0,%xmm0
+ DB 243,65,15,16,84,147,20 ; movss 0x14(%r11,%rdx,4),%xmm2
+ DB 15,198,209,0 ; shufps $0x0,%xmm1,%xmm2
+ DB 15,198,209,226 ; shufps $0xe2,%xmm1,%xmm2
+ DB 15,40,202 ; movaps %xmm2,%xmm1
+ DB 243,65,15,16,84,147,16 ; movss 0x10(%r11,%rdx,4),%xmm2
+ DB 243,15,16,202 ; movss %xmm2,%xmm1
+ DB 243,65,15,16,84,147,12 ; movss 0xc(%r11,%rdx,4),%xmm2
+ DB 15,198,208,32 ; shufps $0x20,%xmm0,%xmm2
+ DB 15,198,194,36 ; shufps $0x24,%xmm2,%xmm0
+ DB 243,65,15,16,84,147,8 ; movss 0x8(%r11,%rdx,4),%xmm2
+ DB 15,198,208,48 ; shufps $0x30,%xmm0,%xmm2
+ DB 15,198,194,132 ; shufps $0x84,%xmm2,%xmm0
+ DB 243,65,15,16,84,147,4 ; movss 0x4(%r11,%rdx,4),%xmm2
+ DB 15,198,208,0 ; shufps $0x0,%xmm0,%xmm2
+ DB 15,198,208,226 ; shufps $0xe2,%xmm0,%xmm2
+ DB 15,40,194 ; movaps %xmm2,%xmm0
+ DB 243,65,15,16,20,147 ; movss (%r11,%rdx,4),%xmm2
+ DB 243,15,16,194 ; movss %xmm2,%xmm0
+ DB 233,247,254,255,255 ; jmpq 14e <_sk_load_8888_ssse3_lowp+0x15>
+ DB 144 ; nop
+ DB 240,255 ; lock (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 222,255 ; fdivrp %st,%st(7)
+ DB 255 ; (bad)
+ DB 255,207 ; dec %edi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,192 ; inc %eax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,181,255,255,255,163 ; pushq -0x5c000001(%rbp)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
+ DB 147 ; xchg %eax,%ebx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
+
+PUBLIC _sk_store_8888_ssse3_lowp
+_sk_store_8888_ssse3_lowp LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 76,139,24 ; mov (%rax),%r11
+ DB 102,68,15,111,208 ; movdqa %xmm0,%xmm10
+ DB 102,65,15,113,210,7 ; psrlw $0x7,%xmm10
+ DB 102,68,15,111,194 ; movdqa %xmm2,%xmm8
+ DB 102,65,15,113,208,7 ; psrlw $0x7,%xmm8
+ DB 102,69,15,103,208 ; packuswb %xmm8,%xmm10
+ DB 102,68,15,111,193 ; movdqa %xmm1,%xmm8
+ DB 102,65,15,113,208,7 ; psrlw $0x7,%xmm8
+ DB 102,68,15,111,203 ; movdqa %xmm3,%xmm9
+ DB 102,65,15,113,209,7 ; psrlw $0x7,%xmm9
+ DB 102,69,15,103,193 ; packuswb %xmm9,%xmm8
+ DB 102,69,15,111,202 ; movdqa %xmm10,%xmm9
+ DB 102,69,15,96,200 ; punpcklbw %xmm8,%xmm9
+ DB 102,69,15,104,208 ; punpckhbw %xmm8,%xmm10
+ DB 102,69,15,111,193 ; movdqa %xmm9,%xmm8
+ DB 102,69,15,97,194 ; punpcklwd %xmm10,%xmm8
+ DB 102,69,15,105,202 ; punpckhwd %xmm10,%xmm9
+ DB 77,133,192 ; test %r8,%r8
+ DB 117,17 ; jne 2e3 <_sk_store_8888_ssse3_lowp+0x6f>
+ DB 243,69,15,127,76,147,16 ; movdqu %xmm9,0x10(%r11,%rdx,4)
+ DB 243,69,15,127,4,147 ; movdqu %xmm8,(%r11,%rdx,4)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+ DB 69,137,193 ; mov %r8d,%r9d
+ DB 65,128,225,7 ; and $0x7,%r9b
+ DB 65,254,201 ; dec %r9b
+ DB 65,128,249,6 ; cmp $0x6,%r9b
+ DB 119,236 ; ja 2df <_sk_store_8888_ssse3_lowp+0x6b>
+ DB 69,15,182,201 ; movzbl %r9b,%r9d
+ DB 76,141,21,90,0,0,0 ; lea 0x5a(%rip),%r10 # 358 <_sk_store_8888_ssse3_lowp+0xe4>
+ DB 75,99,4,138 ; movslq (%r10,%r9,4),%rax
+ DB 76,1,208 ; add %r10,%rax
+ DB 255,224 ; jmpq *%rax
+ DB 102,69,15,112,209,78 ; pshufd $0x4e,%xmm9,%xmm10
+ DB 102,69,15,126,84,147,24 ; movd %xmm10,0x18(%r11,%rdx,4)
+ DB 102,69,15,112,209,229 ; pshufd $0xe5,%xmm9,%xmm10
+ DB 102,69,15,126,84,147,20 ; movd %xmm10,0x14(%r11,%rdx,4)
+ DB 102,69,15,126,76,147,16 ; movd %xmm9,0x10(%r11,%rdx,4)
+ DB 102,69,15,112,200,231 ; pshufd $0xe7,%xmm8,%xmm9
+ DB 102,69,15,126,76,147,12 ; movd %xmm9,0xc(%r11,%rdx,4)
+ DB 102,69,15,112,200,78 ; pshufd $0x4e,%xmm8,%xmm9
+ DB 102,69,15,126,76,147,8 ; movd %xmm9,0x8(%r11,%rdx,4)
+ DB 102,69,15,112,200,229 ; pshufd $0xe5,%xmm8,%xmm9
+ DB 102,69,15,126,76,147,4 ; movd %xmm9,0x4(%r11,%rdx,4)
+ DB 102,69,15,126,4,147 ; movd %xmm8,(%r11,%rdx,4)
+ DB 235,136 ; jmp 2df <_sk_store_8888_ssse3_lowp+0x6b>
+ DB 144 ; nop
+ DB 247,255 ; idiv %edi
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 234 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 221,255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,208 ; callq *%rax
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255,201 ; dec %ecx
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 188,255,255,255,175 ; mov $0xafffffff,%esp
+ DB 255 ; (bad)
+ DB 255 ; (bad)
+ DB 255 ; .byte 0xff
+
+PUBLIC _sk_swap_rb_ssse3_lowp
+_sk_swap_rb_ssse3_lowp LABEL PROC
+ DB 68,15,40,192 ; movaps %xmm0,%xmm8
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 15,40,194 ; movaps %xmm2,%xmm0
+ DB 65,15,40,208 ; movaps %xmm8,%xmm2
+ DB 255,224 ; jmpq *%rax
+
+ALIGN 16
+ DB 129,128,129,128,129,128,129,128,129,128; addl $0x80818081,-0x7f7e7f7f(%rax)
+ DB 129 ; .byte 0x81
+ DB 128 ; .byte 0x80
+ DB 129 ; .byte 0x81
+ DB 128 ; .byte 0x80
+ DB 129 ; .byte 0x81
+ DB 128 ; .byte 0x80
ENDIF
END
diff --git a/src/jumper/SkJumper_stages_lowp.cpp b/src/jumper/SkJumper_stages_lowp.cpp
new file mode 100644
index 0000000000..1abaff76b4
--- /dev/null
+++ b/src/jumper/SkJumper_stages_lowp.cpp
@@ -0,0 +1,180 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkJumper.h"
+#include "SkJumper_misc.h"
+#include <immintrin.h>
+
+#if !defined(__SSSE3__) || !defined(__clang__) || !defined(__x86_64__)
+ #error "We're starting with just SSSE3 x86-64 for now, and will always require Clang."
+#endif
+
+#define WRAP(name) sk_##name##_ssse3_lowp
+
+using K = const SkJumper_constants;
+static const size_t kStride = 8;
+
+template <typename T> using V = T __attribute__((ext_vector_type(8)));
+using U8 = V<uint8_t>;
+using U16 = V<uint16_t>;
+using U32 = V<uint32_t>;
+
+// See SkFixed15.h for details on this format and its operations.
+struct F {
+ U16 vec;
+
+ F() = default;
+ F(uint16_t bits) : vec(bits) {}
+
+ F(U16 v) : vec(v) {}
+ operator U16() const { return vec; }
+};
+
+SI F operator+(F x, F y) { return x.vec + y.vec; }
+SI F operator-(F x, F y) { return x.vec - y.vec; }
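+// _mm_mulhrs_epi16 is a signed multiply, (x*y + 0x4000) >> 15 with rounding.
+// Our values are unsigned [0,32768]; 32768 is 0x8000, which mulhrs reads as
+// -32768, negating the product, so _mm_abs_epi16 flips it back.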
+SI F operator*(F x, F y) { return _mm_abs_epi16(_mm_mulhrs_epi16(x.vec, y.vec)); }
+SI F mad(F f, F m, F a) { return f*m+a; }
+
+SI F operator<<(F x, int bits) { return x.vec << bits; }
+SI F operator>>(F x, int bits) { return x.vec >> bits; }
+
+using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,F, F,F,F,F);
+
+MAYBE_MSABI
+extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t limit, void** program, K* k) {
+ F v{};
+ auto start = (Stage*)load_and_inc(program);
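+ // March kStride (8) pixels at a time; any remainder (1-7 pixels) makes one last pass with tail set.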
+ while (x + kStride <= limit) {
+ start(k,program,x,y,0, v,v,v,v, v,v,v,v);
+ x += kStride;
+ }
+ if (size_t tail = limit - x) {
+ start(k,program,x,y,tail, v,v,v,v, v,v,v,v);
+ }
+}
+extern "C" void WRAP(just_return)(K*, void**, size_t,size_t,size_t, F,F,F,F, F,F,F,F) {}
+
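+// STAGE() wraps each stage body in an extern "C" shim: it peels the stage's
+// context off the program, runs the body, then chains into the next stage.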
+#define STAGE(name) \
+ SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail, \
+ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
+ extern "C" void WRAP(name)(K* k, void** program, size_t x, size_t y, size_t tail, \
+ F r, F g, F b, F a, F dr, F dg, F db, F da) { \
+ LazyCtx ctx(program); \
+ name##_k(k,ctx,x,y,tail, r,g,b,a, dr,dg,db,da); \
+ auto next = (Stage*)load_and_inc(program); \
+ next(k,program,x,y,tail, r,g,b,a, dr,dg,db,da); \
+ } \
+ SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail, \
+ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+
+
+// Helper functions used by multiple stages.
+
+template <typename V, typename T>
+SI V load(const T* src, size_t tail) {
+#if defined(JUMPER)
+ __builtin_assume(tail < kStride);
+ if (__builtin_expect(tail, 0)) {
+ V v{}; // Any inactive lanes are zeroed.
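+ // Deliberate fall-through: entering at case tail-1 copies exactly lanes [0, tail).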
+ switch (tail-1) {
+ case 6: v[6] = src[6];
+ case 5: v[5] = src[5];
+ case 4: v[4] = src[4];
+ case 3: v[3] = src[3];
+ case 2: v[2] = src[2];
+ case 1: v[1] = src[1];
+ case 0: v[0] = src[0];
+ }
+ return v;
+ }
+#endif
+ return unaligned_load<V>(src);
+}
+
+template <typename V, typename T>
+SI void store(T* dst, V v, size_t tail) {
+#if defined(JUMPER)
+ __builtin_assume(tail < kStride);
+ if (__builtin_expect(tail, 0)) {
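+ // Same fall-through trick as load(): write back only lanes [0, tail).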
+ switch (tail-1) {
+ case 6: dst[6] = v[6];
+ case 5: dst[5] = v[5];
+ case 4: dst[4] = v[4];
+ case 3: dst[3] = v[3];
+ case 2: dst[2] = v[2];
+ case 1: dst[1] = v[1];
+ case 0: dst[0] = v[0];
+ }
+ return;
+ }
+#endif
+ unaligned_store(dst, v);
+}
+
+SI void from_8888(U32 rgba, F* r, F* g, F* b, F* a) {
+ // Split the 8 pixels into low and high halves, and reinterpret as vectors of 16-bit values.
+ U16 lo = unaligned_load<U16>((const uint32_t*)&rgba + 0),
+ hi = unaligned_load<U16>((const uint32_t*)&rgba + 4);
+
+ U16 _0415 = _mm_unpacklo_epi8(lo, hi), // r0 r4 g0 g4 b0 b4 a0 a4 r1 r5 g1 g5 b1 b5 a1 a5
+ _2637 = _mm_unpackhi_epi8(lo, hi);
+
+ U16 even = _mm_unpacklo_epi8(_0415, _2637), // r0 r2 r4 r6 g0 g2 g4 g6 b0 b2 b4 b6 a0 a2 a4 a6
+ odd = _mm_unpackhi_epi8(_0415, _2637);
+
+ U16 rg = _mm_unpacklo_epi8(even, odd), // r0 r1 r2 r3 r4 r5 r6 r7 g0 g1 g2 g3 g4 g5 g6 g7
+ ba = _mm_unpackhi_epi8(even, odd);
+
+ // Unpack as 16-bit values into the high half of each 16-bit lane, to get a free *256.
+ U16 R = _mm_unpacklo_epi8(U16(0), rg),
+ G = _mm_unpackhi_epi8(U16(0), rg),
+ B = _mm_unpacklo_epi8(U16(0), ba),
+ A = _mm_unpackhi_epi8(U16(0), ba);
+
+ // Now we scale from [0,255] to [0,32768]. Ideally that's 32768/255 = 128.50196,
+ // but we can approximate that very cheaply as 256*32897/65536 = 128.50391.
+ // 0 and 255 map to 0 and 32768 correctly, and nothing else is off by more than 1.
+ *r = _mm_mulhi_epu16(R, U16(32897));
+ *g = _mm_mulhi_epu16(G, U16(32897));
+ *b = _mm_mulhi_epu16(B, U16(32897));
+ *a = _mm_mulhi_epu16(A, U16(32897));
+}
+SI U32 to_8888(F r, F g, F b, F a) {
+ // We want to interlace and pack these values from [0,32768] to [0,255].
+ // Luckily the simplest possible thing works great: >>7, then saturate.
+ // The 'u' in packus handles the saturation to [0,255] we need.
+ U16 rb = _mm_packus_epi16(r>>7,b>>7), // r0 r1 r2 r3 r4 r5 r6 r7 b0 b1 b2 b3 b4 b5 b6 b7
+ ga = _mm_packus_epi16(g>>7,a>>7);
+
+ U16 rg = _mm_unpacklo_epi8(rb, ga), // r0 g0 r1 g1 ... r7 g7
+ ba = _mm_unpackhi_epi8(rb, ga); // b0 a0 ... b7 a7
+
+ U16 lo = _mm_unpacklo_epi16(rg, ba), // r0 g0 b0 a0 ... r3 g3 b3 a3
+ hi = _mm_unpackhi_epi16(rg, ba); // r4 g4 b4 a4 ... r7 g7 b7 a7
+
+ U32 px;
+ memcpy((uint32_t*)&px + 0, &lo, sizeof(lo));
+ memcpy((uint32_t*)&px + 4, &hi, sizeof(hi));
+ return px;
+}
+
+// Stages!
+
+STAGE(load_8888) {
+ auto ptr = *(const uint32_t**)ctx + x;
+ from_8888(load<U32>(ptr, tail), &r,&g,&b,&a);
+}
+STAGE(store_8888) {
+ auto ptr = *(uint32_t**)ctx + x;
+ store(ptr, to_8888(r,g,b,a), tail);
+}
+
+STAGE(swap_rb) {
+ auto tmp = r;
+ r = b;
+ b = tmp;
+}
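The 32897 scaling in from_8888()/to_8888() round-trips every byte value exactly. A standalone scalar check of the arithmetic (a sketch, independent of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
        for (uint32_t v = 0; v <= 255; v++) {
            // load: the byte sits in the high half of a 16-bit lane (v*256),
            // then pmulhuw scales by 32897/65536, landing in [0,32768].
            uint32_t fixed15 = (v * 256 * 32897) >> 16;
            // store: >>7, then packuswb saturates to [0,255].
            uint32_t back = fixed15 >> 7;
            if (back > 255) { back = 255; }
            assert(back == v);   // lossless for all 256 byte values
        }
    }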
diff --git a/src/jumper/build_stages.py b/src/jumper/build_stages.py
index 301295fa68..971087b3d5 100755
--- a/src/jumper/build_stages.py
+++ b/src/jumper/build_stages.py
@@ -29,6 +29,14 @@ subprocess.check_call(clang + cflags + sse2 + win +
['-c', 'src/jumper/SkJumper_stages.cpp'] +
['-o', 'win_sse2.o'])
+ssse3 = ['-mssse3', '-mno-sse4.1']
+subprocess.check_call(clang + cflags + ssse3 +
+ ['-c', 'src/jumper/SkJumper_stages_lowp.cpp'] +
+ ['-o', 'lowp_ssse3.o'])
+subprocess.check_call(clang + cflags + ssse3 + win +
+ ['-c', 'src/jumper/SkJumper_stages_lowp.cpp'] +
+ ['-o', 'win_lowp_ssse3.o'])
+
sse41 = ['-msse4.1']
subprocess.check_call(clang + cflags + sse41 +
['-c', 'src/jumper/SkJumper_stages.cpp'] +
@@ -187,6 +195,8 @@ print 'BALIGN32'
parse_object_file('sse41.o', '.byte')
print 'BALIGN32'
parse_object_file('sse2.o', '.byte')
+print 'BALIGN32'
+parse_object_file('lowp_ssse3.o', '.byte')
print '#endif'
@@ -210,5 +220,7 @@ print 'ALIGN 32'
parse_object_file('win_sse41.o', 'DB')
print 'ALIGN 32'
parse_object_file('win_sse2.o', 'DB')
+print 'ALIGN 32'
+parse_object_file('win_lowp_ssse3.o', 'DB')
print 'ENDIF'
print 'END'
diff --git a/tests/SkRasterPipelineTest.cpp b/tests/SkRasterPipelineTest.cpp
index f65a2c9868..014683c9b9 100644
--- a/tests/SkRasterPipelineTest.cpp
+++ b/tests/SkRasterPipelineTest.cpp
@@ -205,3 +205,31 @@ DEF_TEST(SkRasterPipeline_tail, r) {
}
}
}
+
+DEF_TEST(SkRasterPipeline_lowp, r) {
+ uint32_t rgba[64];
+ for (int i = 0; i < 64; i++) {
+ rgba[i] = (4*i+0) << 0
+ | (4*i+1) << 8
+ | (4*i+2) << 16
+ | (4*i+3) << 24;
+ }
+
+ void* ptr = rgba;
+
+ SkRasterPipeline_<256> p;
+ p.append(SkRasterPipeline::load_8888, &ptr);
+ p.append(SkRasterPipeline::swap_rb);
+ p.append(SkRasterPipeline::store_8888, &ptr);
+ p.run(0,0,64);
+
+ for (int i = 0; i < 64; i++) {
+ uint32_t want = (4*i+0) << 16
+ | (4*i+1) << 8
+ | (4*i+2) << 0
+ | (4*i+3) << 24;
+ if (rgba[i] != want) {
+ ERRORF(r, "got %08x, want %08x\n", rgba[i], want);
+ }
+ }
+}
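A natural follow-on (hypothetical, not part of this patch) would pin down the lowp tail path too, by running a pixel count that is not a multiple of the 8-pixel stride:

    DEF_TEST(SkRasterPipeline_lowp_tail, r) {
        uint32_t rgba[11];
        for (int i = 0; i < 11; i++) {
            rgba[i] = (4*i+0) << 0 | (4*i+1) << 8 | (4*i+2) << 16 | (4*i+3) << 24;
        }

        void* ptr = rgba;

        SkRasterPipeline_<256> p;
        p.append(SkRasterPipeline::load_8888,  &ptr);
        p.append(SkRasterPipeline::swap_rb);
        p.append(SkRasterPipeline::store_8888, &ptr);
        p.run(0,0,11);   // one full 8-pixel stride, then a 3-pixel tail

        for (int i = 0; i < 11; i++) {
            uint32_t want = (4*i+0) << 16 | (4*i+1) << 8 | (4*i+2) << 0 | (4*i+3) << 24;
            if (rgba[i] != want) {
                ERRORF(r, "got %08x, want %08x\n", rgba[i], want);
            }
        }
    }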