diff options
author | mtklein <mtklein@chromium.org> | 2016-07-12 15:01:26 -0700 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2016-07-12 15:01:26 -0700 |
commit | 281b33fdd909ee3f43192cdf950ce00e3df62407 (patch) | |
tree | 807d54decaa74522b811d17740f3e6e959a519de | |
parent | 7438bfc0804d021aa92cdd5ea644994a4248f3db (diff) |
SkRasterPipeline preliminaries
Re-uploading to see if I can get a CL number < 2^31.
patch from issue 2147533002 at patchset 240001 (http://crrev.com/2147533002#ps240001)
Already reviewed at the other crrev link.
TBR=
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2147533002
CQ_INCLUDE_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review-Url: https://codereview.chromium.org/2144573004
-rw-r--r-- | bench/SkRasterPipelineBench.cpp | 195 | ||||
-rw-r--r-- | gyp/core.gypi | 1 | ||||
-rw-r--r-- | src/core/SkRasterPipeline.cpp | 65 | ||||
-rw-r--r-- | src/core/SkRasterPipeline.h | 104 | ||||
-rw-r--r-- | src/opts/SkNx_neon.h | 1 | ||||
-rw-r--r-- | src/opts/SkNx_sse.h | 1 | ||||
-rw-r--r-- | tests/SkRasterPipelineTest.cpp | 87 |
7 files changed, 454 insertions, 0 deletions
diff --git a/bench/SkRasterPipelineBench.cpp b/bench/SkRasterPipelineBench.cpp new file mode 100644 index 0000000000..b3b87982af --- /dev/null +++ b/bench/SkRasterPipelineBench.cpp @@ -0,0 +1,195 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include "Benchmark.h" +#include "SkRasterPipeline.h" +#include "SkSRGB.h" + +static const int N = 1023; + +static uint32_t dst[N], + src[N]; +static uint8_t mask[N]; + +// We'll build up a somewhat realistic useful pipeline: +// - load srgb src +// - scale src by 8-bit mask +// - load srgb dst +// - src = srcover(dst, src) +// - store src back as srgb +// Every stage except for srcover interacts with memory, and so will need _tail variants. + +static void SK_VECTORCALL load_s_srgb(SkRasterPipeline::Stage* st, size_t x, + Sk4f r, Sk4f g, Sk4f b, Sk4f a, + Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) { + auto ptr = st->ctx<const uint32_t*>() + x; + + r = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 0) & 0xff], + sk_linear_from_srgb[(ptr[1] >> 0) & 0xff], + sk_linear_from_srgb[(ptr[2] >> 0) & 0xff], + sk_linear_from_srgb[(ptr[3] >> 0) & 0xff] }; + + g = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 8) & 0xff], + sk_linear_from_srgb[(ptr[1] >> 8) & 0xff], + sk_linear_from_srgb[(ptr[2] >> 8) & 0xff], + sk_linear_from_srgb[(ptr[3] >> 8) & 0xff] }; + + b = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 16) & 0xff], + sk_linear_from_srgb[(ptr[1] >> 16) & 0xff], + sk_linear_from_srgb[(ptr[2] >> 16) & 0xff], + sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] }; + + a = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f); + + st->next(x, r,g,b,a, dr,dg,db,da); +} + +static void SK_VECTORCALL load_s_srgb_tail(SkRasterPipeline::Stage* st, size_t x, + Sk4f r, Sk4f g, Sk4f b, Sk4f a, + Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) { + auto ptr = st->ctx<const uint32_t*>() + x; + + r = Sk4f{ sk_linear_from_srgb[(*ptr >> 0) & 0xff], 0,0,0 }; + g = Sk4f{ 
sk_linear_from_srgb[(*ptr >> 8) & 0xff], 0,0,0 }; + b = Sk4f{ sk_linear_from_srgb[(*ptr >> 16) & 0xff], 0,0,0 }; + a = Sk4f{ (*ptr >> 24) * (1/255.0f), 0,0,0 }; + + st->next(x, r,g,b,a, dr,dg,db,da); +} + +static void SK_VECTORCALL load_d_srgb(SkRasterPipeline::Stage* st, size_t x, + Sk4f r, Sk4f g, Sk4f b, Sk4f a, + Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) { + auto ptr = st->ctx<const uint32_t*>() + x; + + dr = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 0) & 0xff], + sk_linear_from_srgb[(ptr[1] >> 0) & 0xff], + sk_linear_from_srgb[(ptr[2] >> 0) & 0xff], + sk_linear_from_srgb[(ptr[3] >> 0) & 0xff] }; + + dg = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 8) & 0xff], + sk_linear_from_srgb[(ptr[1] >> 8) & 0xff], + sk_linear_from_srgb[(ptr[2] >> 8) & 0xff], + sk_linear_from_srgb[(ptr[3] >> 8) & 0xff] }; + + db = Sk4f{ sk_linear_from_srgb[(ptr[0] >> 16) & 0xff], + sk_linear_from_srgb[(ptr[1] >> 16) & 0xff], + sk_linear_from_srgb[(ptr[2] >> 16) & 0xff], + sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] }; + + da = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f); + + st->next(x, r,g,b,a, dr,dg,db,da); +} + +static void SK_VECTORCALL load_d_srgb_tail(SkRasterPipeline::Stage* st, size_t x, + Sk4f r, Sk4f g, Sk4f b, Sk4f a, + Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) { + auto ptr = st->ctx<const uint32_t*>() + x; + + dr = Sk4f{ sk_linear_from_srgb[(*ptr >> 0) & 0xff], 0,0,0 }; + dg = Sk4f{ sk_linear_from_srgb[(*ptr >> 8) & 0xff], 0,0,0 }; + db = Sk4f{ sk_linear_from_srgb[(*ptr >> 16) & 0xff], 0,0,0 }; + da = Sk4f{ (*ptr >> 24) * (1/255.0f), 0,0,0 }; + + st->next(x, r,g,b,a, dr,dg,db,da); +} + +static void SK_VECTORCALL scale_u8(SkRasterPipeline::Stage* st, size_t x, + Sk4f r, Sk4f g, Sk4f b, Sk4f a, + Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) { + auto ptr = st->ctx<const uint8_t*>() + x; + + auto c = SkNx_cast<float>(Sk4b::Load(ptr)) * (1/255.0f); + r *= c; + g *= c; + b *= c; + a *= c; + + st->next(x, r,g,b,a, dr,dg,db,da); +} + +static void SK_VECTORCALL scale_u8_tail(SkRasterPipeline::Stage* 
st, size_t x, + Sk4f r, Sk4f g, Sk4f b, Sk4f a, + Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) { + auto ptr = st->ctx<const uint8_t*>() + x; + + auto c = *ptr * (1/255.0f); + r *= c; + g *= c; + b *= c; + a *= c; + + st->next(x, r,g,b,a, dr,dg,db,da); +} + +static void SK_VECTORCALL srcover(SkRasterPipeline::Stage* st, size_t x, + Sk4f r, Sk4f g, Sk4f b, Sk4f a, + Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) { + auto A = 1.0f - a; + r += dr * A; + g += dg * A; + b += db * A; + a += da * A; + + st->next(x, r,g,b,a, dr,dg,db,da); +} + +static Sk4f clamp(const Sk4f& x) { + return Sk4f::Min(Sk4f::Max(x, 0.0f), 255.0f); +} + +static void SK_VECTORCALL store_srgb(SkRasterPipeline::Stage* st, size_t x, + Sk4f r, Sk4f g, Sk4f b, Sk4f a, + Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) { + auto ptr = st->ctx<uint32_t*>() + x; + + r = clamp(sk_linear_to_srgb(r)); + g = clamp(sk_linear_to_srgb(g)); + b = clamp(sk_linear_to_srgb(b)); + a = clamp( 255.0f * a ); + + ( SkNx_cast<int>(r) + | SkNx_cast<int>(g) << 8 + | SkNx_cast<int>(b) << 16 + | SkNx_cast<int>(a) << 24 ).store(ptr); +} + +static void SK_VECTORCALL store_srgb_tail(SkRasterPipeline::Stage* st, size_t x, + Sk4f r, Sk4f g, Sk4f b, Sk4f a, + Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) { + auto ptr = st->ctx<uint32_t*>() + x; + + auto rgba = sk_linear_to_srgb({r[0], g[0], b[0], 0}); + rgba = {rgba[0], rgba[1], rgba[2], 255.0f*a[0]}; + rgba = clamp(rgba); + + SkNx_cast<uint8_t>(rgba).store(ptr); +} + +class SkRasterPipelineBench : public Benchmark { +public: + SkRasterPipelineBench() {} + + bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; } + const char* onGetName() override { return "SkRasterPipelineBench"; } + + void onDraw(int loops, SkCanvas*) override { + SkRasterPipeline p; + p.append(load_s_srgb, load_s_srgb_tail, src); + p.append( scale_u8, scale_u8_tail, mask); + p.append(load_d_srgb, load_d_srgb_tail, dst); + p.append(srcover); + p.append( store_srgb, store_srgb_tail, dst); + + while (loops --> 0) { + 
p.run(N); + } + } +}; + +DEF_BENCH( return new SkRasterPipelineBench; ) diff --git a/gyp/core.gypi b/gyp/core.gypi index 97e38c2cc6..f49446321a 100644 --- a/gyp/core.gypi +++ b/gyp/core.gypi @@ -231,6 +231,7 @@ '<(skia_src_path)/core/SkQuadClipper.cpp', '<(skia_src_path)/core/SkQuadClipper.h', '<(skia_src_path)/core/SkRasterClip.cpp', + '<(skia_src_path)/core/SkRasterPipeline.cpp', '<(skia_src_path)/core/SkRasterizer.cpp', '<(skia_src_path)/core/SkReadBuffer.h', '<(skia_src_path)/core/SkReadBuffer.cpp', diff --git a/src/core/SkRasterPipeline.cpp b/src/core/SkRasterPipeline.cpp new file mode 100644 index 0000000000..6a8f10975a --- /dev/null +++ b/src/core/SkRasterPipeline.cpp @@ -0,0 +1,65 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include "SkRasterPipeline.h" + +SkRasterPipeline::SkRasterPipeline() {} + +void SkRasterPipeline::append(SkRasterPipeline::Fn body, const void* body_ctx, + SkRasterPipeline::Fn tail, const void* tail_ctx) { + // We can't add more stages after being rewired to run(). + SkASSERT(!fReadyToRun); + + // For now, just stash the stage's function in its own fNext slot. + // We'll rewire our stages before running the pipeline so fNext makes sense. + fBody.push_back({ body, const_cast<void*>(body_ctx) }); + fTail.push_back({ tail, const_cast<void*>(tail_ctx) }); +} + +void SkRasterPipeline::run(size_t n) { + if (fBody.empty() || fTail.empty()) { + return; + } + + if (!fReadyToRun) { + auto rewire = [](Stages* stages) { + SkASSERT(!stages->empty()); + + // Rotate the fNext pointers so they point to the next function to + // call, not function we're currently calling as set by append(). + auto start = stages->front().fNext; + for (int i = 0; i < stages->count() - 1; i++) { + (*stages)[i].fNext = (*stages)[i+1].fNext; + } + stages->back().fNext = start; // This is a pretty handy place to stash this. 
+ }; + rewire(&fBody); + rewire(&fTail); + fReadyToRun = true; + } + + // It's fastest to start uninitialized if the compilers all let us. If not, next fastest is 0. + Sk4f v; + + auto start_body = fBody.back().fNext, // See rewire(). + start_tail = fTail.back().fNext; + + auto body = fBody.begin(), + tail = fTail.begin(); + + size_t x = 0; + while (n >= 4) { + start_body(body, x, v,v,v,v, v,v,v,v); + x += 4; + n -= 4; + } + while (n > 0) { + start_tail(tail, x, v,v,v,v, v,v,v,v); + x += 1; + n -= 1; + } +} diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h new file mode 100644 index 0000000000..8ae7bb1f2e --- /dev/null +++ b/src/core/SkRasterPipeline.h @@ -0,0 +1,104 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#ifndef SkRasterPipeline_DEFINED +#define SkRasterPipeline_DEFINED + +#include "SkNx.h" +#include "SkTArray.h" +#include "SkTypes.h" + +/** + * SkRasterPipeline provides a cheap way to chain together a pixel processing pipeline. + * + * It's particularly designed for situations where the potential pipeline is extremely + * combinatoric: {N dst formats} x {M source formats} x {K mask formats} x {C transfer modes} ... + * No one wants to write specialized routines for all those combinations, and if we did, we'd + * end up bloating our code size dramatically. SkRasterPipeline stages can be chained together + * at runtime, so we can scale this problem linearly rather than combinatorically. + * + * Each stage is represented by a function conforming to a common interface, SkRasterPipeline::Fn, + * and by an arbitrary context pointer. Fn's arguments, and sometimes custom calling convention, + * are designed to maximize the amount of data we can pass along the pipeline cheaply. + * On many machines all arguments stay in registers the entire time. + * + * The meaning of the arguments to Fn are sometimes fixed... 
+ * - The Stage* always represents the current stage, mainly providing access to ctx(). + * - The size_t is always the destination x coordinate. If you need y, put it in your context. + * - By the time the shader's done, the first four vectors should hold source red, + * green, blue, and alpha, up to 4 pixels' worth each. + * + * ...and sometimes flexible: + * - In the shader, the first four vectors can be used for anything, e.g. sample coordinates. + * - The last four vectors are scratch registers that can be used to communicate between + * stages; transfer modes use these to hold the original destination pixel components. + * + * On some platforms the last four vectors are slower to work with than the other arguments. + * + * When done mutating its arguments and/or context, a stage can either: + * 1) call st->next() with its mutated arguments, chaining to the next stage of the pipeline; or + * 2) return, indicating the pipeline is complete for these pixels. + * + * Some obvious stages that typically return are those that write a color to a destination pointer, + * but any stage can short-circuit the rest of the pipeline by returning instead of calling next(). + */ + +class SkRasterPipeline { +public: + struct Stage; + using Fn = void(SK_VECTORCALL *)(Stage*, size_t, Sk4f,Sk4f,Sk4f,Sk4f, + Sk4f,Sk4f,Sk4f,Sk4f); + struct Stage { + template <typename T> + T ctx() { return static_cast<T>(fCtx); } + + void SK_VECTORCALL next(size_t x, Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3, + Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) { + // Stages are logically a pipeline, and physically are contiguous in an array. + // To get to the next stage, we just increment our pointer to the next array element. + fNext(this+1, x, v0,v1,v2,v3, v4,v5,v6,v7); + } + + // It makes next() a good bit cheaper if we hold the next function to call here, + // rather than logically simpler choice of the function implementing this stage. 
+ Fn fNext; + void* fCtx; + }; + + + SkRasterPipeline(); + + // Run the pipeline constructed with append(), walking x through [0,n), + // generally in 4 pixel steps, but sometimes 1 pixel at a time. + void run(size_t n); + + // Use this append() if your stage is sensitive to the number of pixels you're working with: + // - body will always be called for a full 4 pixels + // - tail will always be called for a single pixel + // Typically this is only an essential distinction for stages that read or write memory. + void append(Fn body, const void* body_ctx, + Fn tail, const void* tail_ctx); + + // Most stages don't actually care if they're working on 4 or 1 pixel. + void append(Fn fn, const void* ctx = nullptr) { + this->append(fn, ctx, fn, ctx); + } + + // Most 4 pixel or 1 pixel variants share the same context pointer. + void append(Fn body, Fn tail, const void* ctx = nullptr) { + this->append(body, ctx, tail, ctx); + } + +private: + using Stages = SkSTArray<10, Stage, /*MEM_COPY=*/true>; + + Stages fBody, + fTail; + bool fReadyToRun = false; +}; + +#endif//SkRasterPipeline_DEFINED diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h index 6d4cade12e..91cd104482 100644 --- a/src/opts/SkNx_neon.h +++ b/src/opts/SkNx_neon.h @@ -386,6 +386,7 @@ public: SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); } SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); } + SkNx operator & (const SkNx& o) const { return vandq_s32(fVec, o.fVec); } SkNx operator | (const SkNx& o) const { return vorrq_s32(fVec, o.fVec); } SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); } diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h index 8952ff77f7..78cea3b3d8 100644 --- a/src/opts/SkNx_sse.h +++ b/src/opts/SkNx_sse.h @@ -150,6 +150,7 @@ public: _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0))); } + SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); } SkNx operator | (const SkNx& o) const { return 
_mm_or_si128(fVec, o.fVec); } SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); } diff --git a/tests/SkRasterPipelineTest.cpp b/tests/SkRasterPipelineTest.cpp new file mode 100644 index 0000000000..1db0206705 --- /dev/null +++ b/tests/SkRasterPipelineTest.cpp @@ -0,0 +1,87 @@ +/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include "Test.h" +#include "SkRasterPipeline.h" + +// load needs two variants, one to load 4 values... +static void SK_VECTORCALL load(SkRasterPipeline::Stage* st, size_t x, + Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3, + Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) { + auto ptr = st->ctx<const float*>(); + v0 = Sk4f{ptr[x+0]}; + v1 = Sk4f{ptr[x+1]}; + v2 = Sk4f{ptr[x+2]}; + v3 = Sk4f{ptr[x+3]}; + + st->next(x, v0,v1,v2,v3, v4,v5,v6,v7); +} + +// ...and one to load a single value. +static void SK_VECTORCALL load_tail(SkRasterPipeline::Stage* st, size_t x, + Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3, + Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) { + auto ptr = st->ctx<const float*>(); + v0 = Sk4f{ptr[x]}; + + st->next(x, v0,v1,v2,v3, v4,v5,v6,v7); +} + +// square doesn't really care how many of its inputs are active, nor does it need a context. +static void SK_VECTORCALL square(SkRasterPipeline::Stage* st, size_t x, + Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3, + Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) { + v0 *= v0; + v1 *= v1; + v2 *= v2; + v3 *= v3; + st->next(x, v0,v1,v2,v3, v4,v5,v6,v7); +} + +// Like load, store has a _tail variant. It ends the pipeline by returning. 
+static void SK_VECTORCALL store(SkRasterPipeline::Stage* st, size_t x, + Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3, + Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) { + auto ptr = st->ctx<float*>(); + ptr[x+0] = v0[0]; + ptr[x+1] = v1[0]; + ptr[x+2] = v2[0]; + ptr[x+3] = v3[0]; +} + +static void SK_VECTORCALL store_tail(SkRasterPipeline::Stage* st, size_t x, + Sk4f v0, Sk4f v1, Sk4f v2, Sk4f v3, + Sk4f v4, Sk4f v5, Sk4f v6, Sk4f v7) { + auto ptr = st->ctx<float*>(); + ptr[x+0] = v0[0]; +} + +DEF_TEST(SkRasterPipeline, r) { + // We'll build up and run a simple pipeline that exercises the salient + // mechanics of SkRasterPipeline: + // - context pointers + // - stages sensitive to the number of pixels + // - stages insensitive to the number of pixels + // + // This pipeline loads up some values, squares them, then writes them back to memory. + + const float src_vals[] = { 1,2,3,4,5 }; + float dst_vals[] = { 0,0,0,0,0 }; + + SkRasterPipeline p; + p.append(load, load_tail, src_vals); + p.append(square); + p.append(store, store_tail, dst_vals); + + p.run(5); + + REPORTER_ASSERT(r, dst_vals[0] == 1); + REPORTER_ASSERT(r, dst_vals[1] == 4); + REPORTER_ASSERT(r, dst_vals[2] == 9); + REPORTER_ASSERT(r, dst_vals[3] == 16); + REPORTER_ASSERT(r, dst_vals[4] == 25); +} |