author     Mike Klein <mtklein@chromium.org>               2017-01-04 11:04:01 -0500
committer  Skia Commit-Bot <skia-commit-bot@chromium.org>  2017-01-04 20:58:21 +0000
commit     464e6a1b5a0598eb1805fa6c11f515466346704f (patch)
tree       840901b5324c6c28397acc39ec93094cc7d0ebb5 /src
parent     07792b218e1cf31c42611276d597fcc99677d391 (diff)
Retry trim another instruction off SkRasterPipeline overhead
This time, with manual program memory management instead of std::vector<void*>.

Using STL types from SkOpts_hsw.cpp is not safe. Things like std::vector<void*> are
inlined but not anonymous, so they are deduplicated by the linker arbitrarily. This is
bad when the linker picks the version compiled with AVX instructions and we run it on
a machine that doesn't support AVX...

std::vector<Stage> was safe before because Stage itself was anonymous. While
std::vector<Stage> is not itself anonymous, it is unique to the compilation unit,
because the anonymous Stage can only be referred to within that compilation unit.

CQ_INCLUDE_TRYBOTS=master.tryserver.chromium.linux:linux_chromium_asan_rel_ng;skia.primary:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD

Change-Id: I015e27583b6b6ff06b5e9f63e3f40ee6b27d6dbd
Reviewed-on: https://skia-review.googlesource.com/6550
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
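To make the hazard concrete, here is a minimal sketch (hypothetical file and function
names, not Skia code) of how the arbitrary dedup bites: two translation units compiled
for different instruction sets both instantiate the same inline std::vector member, and
the linker keeps exactly one of the two definitions.

    // baseline_unit.cpp -- compiled with baseline (SSE2-only) flags
    #include <vector>
    void build_baseline() {
        std::vector<void*> program;      // instantiates std::vector<void*>::push_back
        program.push_back(nullptr);      // as a weak, non-anonymous inline symbol
    }

    // hsw_unit.cpp -- compiled with -mavx2
    #include <vector>
    void build_hsw() {
        std::vector<void*> program;      // instantiates the *same* symbol, but this
        program.push_back(nullptr);      // copy contains AVX2 instructions
    }

    // The linker deduplicates the two definitions arbitrarily. If the AVX2 copy wins
    // and build_baseline() runs on a pre-AVX2 machine: illegal-instruction fault.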
Diffstat (limited to 'src')
-rw-r--r--  src/opts/SkOpts_hsw.cpp          |   8
-rw-r--r--  src/opts/SkRasterPipeline_opts.h | 405
2 files changed, 232 insertions, 181 deletions
diff --git a/src/opts/SkOpts_hsw.cpp b/src/opts/SkOpts_hsw.cpp
index d0845c58f7..bb4e4d1f47 100644
--- a/src/opts/SkOpts_hsw.cpp
+++ b/src/opts/SkOpts_hsw.cpp
@@ -6,8 +6,14 @@
*/
#include "SkSafe_math.h" // Keep this first.
-#include "SkOpts.h"
+// Please note carefully.
+// It is not safe for _opts.h files included here to use STL types, for the
+// same reason we just had to include SkSafe_math.h: STL types are templated,
+// defined in headers, but not in anonymous namespaces. It's very easy to
+// cause ODR violations with these types and AVX+ code generation.
+
+#include "SkOpts.h"
#define SK_OPTS_NS hsw
#include "SkBitmapFilter_opts.h"
#include "SkRasterPipeline_opts.h"
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index c0d6599523..3b877ce156 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -19,7 +19,6 @@
#include "SkPM4fPriv.h"
#include "SkRasterPipeline.h"
#include "SkSRGB.h"
-#include <utility>
namespace {
@@ -35,64 +34,108 @@ namespace {
using SkNh = SkNx<N, uint16_t>;
using SkNb = SkNx<N, uint8_t>;
- struct Stage;
- using Fn = void(SK_VECTORCALL *)(Stage*, size_t x_tail, SkNf,SkNf,SkNf,SkNf,
- SkNf,SkNf,SkNf,SkNf);
- struct Stage { Fn next; void* ctx; };
-
+ using Fn = void(SK_VECTORCALL *)(size_t x_tail, void** p, SkNf,SkNf,SkNf,SkNf,
+ SkNf,SkNf,SkNf,SkNf);
// x_tail encodes two values x and tail as x*N+tail, where 0 <= tail < N.
// x is the induction variable we're walking along, incrementing by N each step.
// tail == 0 means work with a full N pixels; otherwise use only the low tail pixels.
+ //
+ // p is our program, a sequence of Fn to call interlaced with any void* context pointers. E.g.
+ // &load_8888
+ // (src ptr)
+ // &from_srgb
+ // &load_f16_d
+ // (dst ptr)
+ // &srcover
+ // &store_f16
+ // (dst ptr)
+ // &just_return
} // namespace
#define SI static inline
+// Basically, return *(*ptr)++, maybe faster than the compiler can do it.
+SI void* load_and_increment(void*** ptr) {
+ // We do this often enough that it's worth hyper-optimizing.
+ // x86 can do this in one instruction if ptr is in rsi.
+ // (This is why p is the second argument to Fn: it's passed in rsi.)
+#if defined(__GNUC__) && defined(__x86_64__)
+ void* rax;
+ __asm__("lodsq" : "=a"(rax), "+S"(*ptr));
+ return rax;
+#else
+ return *(*ptr)++;
+#endif
+}
+
// Stages are logically a pipeline, and physically are contiguous in an array.
// To get to the next stage, we just increment our pointer to the next array element.
-SI void SK_VECTORCALL next(Stage* st, size_t x_tail, SkNf r, SkNf g, SkNf b, SkNf a,
- SkNf dr, SkNf dg, SkNf db, SkNf da) {
- st->next(st+1, x_tail, r,g,b,a, dr,dg,db,da);
+SI void SK_VECTORCALL next(size_t x_tail, void** p, SkNf r, SkNf g, SkNf b, SkNf a,
+ SkNf dr, SkNf dg, SkNf db, SkNf da) {
+ auto next = (Fn)load_and_increment(&p);
+ next(x_tail,p, r,g,b,a, dr,dg,db,da);
}
// Stages defined below always call next.
// This is always the last stage, a backstop that actually returns to the caller when done.
-SI void SK_VECTORCALL just_return(Stage*, size_t, SkNf, SkNf, SkNf, SkNf,
+SI void SK_VECTORCALL just_return(size_t, void**, SkNf, SkNf, SkNf, SkNf,
SkNf, SkNf, SkNf, SkNf) {}
#define STAGE(name) \
- static SK_ALWAYS_INLINE void name##_kernel(void* ctx, size_t x, size_t tail, \
+ static SK_ALWAYS_INLINE void name##_kernel(size_t x, size_t tail, \
SkNf& r, SkNf& g, SkNf& b, SkNf& a, \
SkNf& dr, SkNf& dg, SkNf& db, SkNf& da); \
- SI void SK_VECTORCALL name(Stage* st, size_t x_tail, \
+ SI void SK_VECTORCALL name(size_t x_tail, void** p, \
SkNf r, SkNf g, SkNf b, SkNf a, \
SkNf dr, SkNf dg, SkNf db, SkNf da) { \
- name##_kernel(st->ctx, x_tail/N, x_tail%N, r,g,b,a, dr,dg,db,da); \
- next(st, x_tail, r,g,b,a, dr,dg,db,da); \
+ name##_kernel(x_tail/N, x_tail%N, r,g,b,a, dr,dg,db,da); \
+ next(x_tail,p, r,g,b,a, dr,dg,db,da); \
} \
- SI void SK_VECTORCALL name##_d(Stage* st, size_t x_tail, \
+ SI void SK_VECTORCALL name##_d(size_t x_tail, void** p, \
SkNf r, SkNf g, SkNf b, SkNf a, \
SkNf dr, SkNf dg, SkNf db, SkNf da) { \
- name##_kernel(st->ctx, x_tail/N, x_tail%N, dr,dg,db,da, r,g,b,a); \
- next(st, x_tail, r,g,b,a, dr,dg,db,da); \
+ name##_kernel(x_tail/N, x_tail%N, dr,dg,db,da, r,g,b,a); \
+ next(x_tail,p, r,g,b,a, dr,dg,db,da); \
} \
- static SK_ALWAYS_INLINE void name##_kernel(void* ctx, size_t x, size_t tail, \
+ static SK_ALWAYS_INLINE void name##_kernel(size_t x, size_t tail, \
SkNf& r, SkNf& g, SkNf& b, SkNf& a, \
SkNf& dr, SkNf& dg, SkNf& db, SkNf& da)
+#define STAGE_CTX(name, Ctx) \
+ static SK_ALWAYS_INLINE void name##_kernel(Ctx ctx, size_t x, size_t tail, \
+ SkNf& r, SkNf& g, SkNf& b, SkNf& a, \
+ SkNf& dr, SkNf& dg, SkNf& db, SkNf& da); \
+ SI void SK_VECTORCALL name(size_t x_tail, void** p, \
+ SkNf r, SkNf g, SkNf b, SkNf a, \
+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \
+ auto ctx = (Ctx)load_and_increment(&p); \
+ name##_kernel(ctx, x_tail/N, x_tail%N, r,g,b,a, dr,dg,db,da); \
+ next(x_tail,p, r,g,b,a, dr,dg,db,da); \
+ } \
+ SI void SK_VECTORCALL name##_d(size_t x_tail, void** p, \
+ SkNf r, SkNf g, SkNf b, SkNf a, \
+ SkNf dr, SkNf dg, SkNf db, SkNf da) { \
+ auto ctx = (Ctx)load_and_increment(&p); \
+ name##_kernel(ctx, x_tail/N, x_tail%N, dr,dg,db,da, r,g,b,a); \
+ next(x_tail,p, r,g,b,a, dr,dg,db,da); \
+ } \
+ static SK_ALWAYS_INLINE void name##_kernel(Ctx ctx, size_t x, size_t tail, \
+ SkNf& r, SkNf& g, SkNf& b, SkNf& a, \
+ SkNf& dr, SkNf& dg, SkNf& db, SkNf& da)
// Many xfermodes apply the same logic to each channel.
#define RGBA_XFERMODE(name) \
static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \
const SkNf& d, const SkNf& da); \
- SI void SK_VECTORCALL name(Stage* st, size_t x_tail, \
+ SI void SK_VECTORCALL name(size_t x_tail, void** p, \
SkNf r, SkNf g, SkNf b, SkNf a, \
SkNf dr, SkNf dg, SkNf db, SkNf da) { \
r = name##_kernel(r,a,dr,da); \
g = name##_kernel(g,a,dg,da); \
b = name##_kernel(b,a,db,da); \
a = name##_kernel(a,a,da,da); \
- next(st, x_tail, r,g,b,a, dr,dg,db,da); \
+ next(x_tail,p, r,g,b,a, dr,dg,db,da); \
} \
static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \
const SkNf& d, const SkNf& da)
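To make the new calling convention concrete, here is a hand-written sketch of a program
array and of the x_tail packing (illustrative only: it assumes it sits inside this header
where the stage functions are visible, and src_ptr/dst_ptr are hypothetical row pointers;
the real array is built by Compiled further down):

    uint32_t *src_ptr = /* hypothetical source row */ nullptr,
             *dst_ptr = /* hypothetical destination row */ nullptr;

    // A program is a flat array of void*: each stage's function pointer, immediately
    // followed by its context pointer if it takes one, terminated by just_return.
    void* program[] = {
        (void*)load_8888,  (void*)&src_ptr,   // STAGE_CTX: fn + ctx slot
        (void*)from_srgb,                     // STAGE: fn only
        (void*)srcover,                       // RGBA_XFERMODE: fn only
        (void*)store_8888, (void*)&dst_ptr,
        (void*)just_return,
    };

    // x_tail packs the pixel index x and the tail count into one size_t:
    //   a full stride of N pixels starting at x:  x_tail = x*N      -> kernel sees x, tail == 0
    //   the final n (0 < n < N) pixels at x:      x_tail = x*N + n  -> kernel sees x, tail == n
    // Stages recover the pair with x_tail/N and x_tail%N.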
@@ -101,14 +144,14 @@ SI void SK_VECTORCALL just_return(Stage*, size_t, SkNf, SkNf, SkNf, SkNf,
#define RGB_XFERMODE(name) \
static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \
const SkNf& d, const SkNf& da); \
- SI void SK_VECTORCALL name(Stage* st, size_t x_tail, \
+ SI void SK_VECTORCALL name(size_t x_tail, void** p, \
SkNf r, SkNf g, SkNf b, SkNf a, \
SkNf dr, SkNf dg, SkNf db, SkNf da) { \
r = name##_kernel(r,a,dr,da); \
g = name##_kernel(g,a,dg,da); \
b = name##_kernel(b,a,db,da); \
a = a + (da * (1.0f-a)); \
- next(st, x_tail, r,g,b,a, dr,dg,db,da); \
+ next(x_tail,p, r,g,b,a, dr,dg,db,da); \
} \
static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \
const SkNf& d, const SkNf& da)
@@ -280,8 +323,8 @@ SI void from_f16(const void* px, SkNf* r, SkNf* g, SkNf* b, SkNf* a) {
*a = SkHalfToFloat_finite_ftz(ah);
}
-STAGE(trace) {
- SkDebugf("%s\n", (const char*)ctx);
+STAGE_CTX(trace, const char*) {
+ SkDebugf("%s\n", ctx);
}
STAGE(registers) {
auto print = [](const char* name, const SkNf& v) {
@@ -332,11 +375,10 @@ STAGE(premul) {
b *= a;
}
-STAGE(set_rgb) {
- auto rgb = (const float*)ctx;
- r = rgb[0];
- g = rgb[1];
- b = rgb[2];
+STAGE_CTX(set_rgb, const float*) {
+ r = ctx[0];
+ g = ctx[1];
+ b = ctx[2];
}
STAGE(move_src_dst) {
@@ -397,17 +439,16 @@ STAGE(to_2dot2) {
}
// The default shader produces a constant color (from the SkPaint).
-STAGE(constant_color) {
- auto color = (const SkPM4f*)ctx;
- r = color->r();
- g = color->g();
- b = color->b();
- a = color->a();
+STAGE_CTX(constant_color, const SkPM4f*) {
+ r = ctx->r();
+ g = ctx->g();
+ b = ctx->b();
+ a = ctx->a();
}
// s' = sc for a scalar c.
-STAGE(scale_1_float) {
- SkNf c = *(const float*)ctx;
+STAGE_CTX(scale_1_float, const float*) {
+ SkNf c = *ctx;
r *= c;
g *= c;
@@ -415,10 +456,10 @@ STAGE(scale_1_float) {
a *= c;
}
// s' = sc for 8-bit c.
-STAGE(scale_u8) {
- auto ptr = *(const uint8_t**)ctx + x;
-
+STAGE_CTX(scale_u8, const uint8_t**) {
+ auto ptr = *ctx + x;
SkNf c = SkNf_from_byte(load(tail, ptr));
+
r = r*c;
g = g*c;
b = b*c;
@@ -430,8 +471,8 @@ SI SkNf lerp(const SkNf& from, const SkNf& to, const SkNf& cov) {
}
// s' = d(1-c) + sc, for a scalar c.
-STAGE(lerp_1_float) {
- SkNf c = *(const float*)ctx;
+STAGE_CTX(lerp_1_float, const float*) {
+ SkNf c = *ctx;
r = lerp(dr, r, c);
g = lerp(dg, g, c);
@@ -440,10 +481,10 @@ STAGE(lerp_1_float) {
}
// s' = d(1-c) + sc for 8-bit c.
-STAGE(lerp_u8) {
- auto ptr = *(const uint8_t**)ctx + x;
-
+STAGE_CTX(lerp_u8, const uint8_t**) {
+ auto ptr = *ctx + x;
SkNf c = SkNf_from_byte(load(tail, ptr));
+
r = lerp(dr, r, c);
g = lerp(dg, g, c);
b = lerp(db, b, c);
@@ -451,8 +492,8 @@ STAGE(lerp_u8) {
}
// s' = d(1-c) + sc for 565 c.
-STAGE(lerp_565) {
- auto ptr = *(const uint16_t**)ctx + x;
+STAGE_CTX(lerp_565, const uint16_t**) {
+ auto ptr = *ctx + x;
SkNf cr, cg, cb;
from_565(load(tail, ptr), &cr, &cg, &cb);
@@ -462,21 +503,21 @@ STAGE(lerp_565) {
a = 1.0f;
}
-STAGE(load_565) {
- auto ptr = *(const uint16_t**)ctx + x;
+STAGE_CTX(load_565, const uint16_t**) {
+ auto ptr = *ctx + x;
from_565(load(tail, ptr), &r,&g,&b);
a = 1.0f;
}
-STAGE(store_565) {
- auto ptr = *(uint16_t**)ctx + x;
+STAGE_CTX(store_565, uint16_t**) {
+ auto ptr = *ctx + x;
store(tail, SkNx_cast<uint16_t>( SkNf_round(r, SK_R16_MASK) << SK_R16_SHIFT
| SkNf_round(g, SK_G16_MASK) << SK_G16_SHIFT
| SkNf_round(b, SK_B16_MASK) << SK_B16_SHIFT), ptr);
}
-STAGE(load_f16) {
- auto ptr = *(const uint64_t**)ctx + x;
+STAGE_CTX(load_f16, const uint64_t**) {
+ auto ptr = *ctx + x;
const void* src = ptr;
SkNx<N, uint64_t> px;
@@ -486,8 +527,8 @@ STAGE(load_f16) {
}
from_f16(src, &r, &g, &b, &a);
}
-STAGE(store_f16) {
- auto ptr = *(uint64_t**)ctx + x;
+STAGE_CTX(store_f16, uint64_t**) {
+ auto ptr = *ctx + x;
SkNx<N, uint64_t> px;
SkNh::Store4(tail ? (void*)&px : (void*)ptr, SkFloatToHalf_finite_ftz(r),
@@ -499,8 +540,8 @@ STAGE(store_f16) {
}
}
-STAGE(store_f32) {
- auto ptr = *(SkPM4f**)ctx + x;
+STAGE_CTX(store_f32, SkPM4f**) {
+ auto ptr = *ctx + x;
SkNx<N, SkPM4f> px;
SkNf::Store4(tail ? (void*)&px : (void*)ptr, r,g,b,a);
@@ -510,11 +551,11 @@ STAGE(store_f32) {
}
-STAGE(load_8888) {
- auto ptr = *(const uint32_t**)ctx + x;
+STAGE_CTX(load_8888, const uint32_t**) {
+ auto ptr = *ctx + x;
from_8888(load(tail, ptr), &r, &g, &b, &a);
}
-STAGE(store_8888) {
+STAGE_CTX(store_8888, uint32_t**) {
auto byte = [](const SkNf& x, int ix) {
// Here's a neat trick: 0x47000000 == 32768.0f, and 0x470000ff == 32768.0f + (255/256.0f).
auto v = SkNf_fma(255/256.0f, x, 32768.0f);
@@ -525,35 +566,33 @@ STAGE(store_8888) {
return (SkNi::Load(&v) & 0xff) << (8*ix); // B or G
};
- auto ptr = *(uint32_t**)ctx + x;
+ auto ptr = *ctx + x;
store(tail, byte(r,0)|byte(g,1)|byte(b,2)|byte(a,3), (int*)ptr);
}
-STAGE(load_tables) {
- auto loadCtx = (const LoadTablesContext*)ctx;
- auto ptr = loadCtx->fSrc + x;
+STAGE_CTX(load_tables, const LoadTablesContext*) {
+ auto ptr = ctx->fSrc + x;
SkNu rgba = load(tail, ptr);
auto to_int = [](const SkNu& v) { return SkNi::Load(&v); };
- r = gather(tail, loadCtx->fR, to_int((rgba >> 0) & 0xff));
- g = gather(tail, loadCtx->fG, to_int((rgba >> 8) & 0xff));
- b = gather(tail, loadCtx->fB, to_int((rgba >> 16) & 0xff));
+ r = gather(tail, ctx->fR, to_int((rgba >> 0) & 0xff));
+ g = gather(tail, ctx->fG, to_int((rgba >> 8) & 0xff));
+ b = gather(tail, ctx->fB, to_int((rgba >> 16) & 0xff));
a = SkNf_from_byte(rgba >> 24);
}
-STAGE(store_tables) {
- auto storeCtx = (const StoreTablesContext*)ctx;
- auto ptr = storeCtx->fDst + x;
+STAGE_CTX(store_tables, const StoreTablesContext*) {
+ auto ptr = ctx->fDst + x;
- float scale = storeCtx->fCount - 1;
+ float scale = ctx->fCount - 1;
SkNi ri = SkNf_round(scale, r);
SkNi gi = SkNf_round(scale, g);
SkNi bi = SkNf_round(scale, b);
- store(tail, ( SkNx_cast<int>(gather(tail, storeCtx->fR, ri)) << 0
- | SkNx_cast<int>(gather(tail, storeCtx->fG, gi)) << 8
- | SkNx_cast<int>(gather(tail, storeCtx->fB, bi)) << 16
- | SkNf_round(255.0f, a) << 24), (int*)ptr);
+ store(tail, ( SkNx_cast<int>(gather(tail, ctx->fR, ri)) << 0
+ | SkNx_cast<int>(gather(tail, ctx->fG, gi)) << 8
+ | SkNx_cast<int>(gather(tail, ctx->fB, bi)) << 16
+ | SkNf_round(255.0f, a) << 24), (int*)ptr);
}
SI SkNf inv(const SkNf& x) { return 1.0f - x; }
@@ -614,16 +653,16 @@ STAGE(luminance_to_alpha) {
r = g = b = 0;
}
-STAGE(matrix_2x3) {
- auto m = (const float*)ctx;
+STAGE_CTX(matrix_2x3, const float*) {
+ auto m = ctx;
auto R = SkNf_fma(r,m[0], SkNf_fma(g,m[2], m[4])),
G = SkNf_fma(r,m[1], SkNf_fma(g,m[3], m[5]));
r = R;
g = G;
}
-STAGE(matrix_3x4) {
- auto m = (const float*)ctx;
+STAGE_CTX(matrix_3x4, const float*) {
+ auto m = ctx;
auto R = SkNf_fma(r,m[0], SkNf_fma(g,m[3], SkNf_fma(b,m[6], m[ 9]))),
G = SkNf_fma(r,m[1], SkNf_fma(g,m[4], SkNf_fma(b,m[7], m[10]))),
@@ -632,8 +671,8 @@ STAGE(matrix_3x4) {
g = G;
b = B;
}
-STAGE(matrix_4x5) {
- auto m = (const float*)ctx;
+STAGE_CTX(matrix_4x5, const float*) {
+ auto m = ctx;
auto R = SkNf_fma(r,m[0], SkNf_fma(g,m[4], SkNf_fma(b,m[ 8], SkNf_fma(a,m[12], m[16])))),
G = SkNf_fma(r,m[1], SkNf_fma(g,m[5], SkNf_fma(b,m[ 9], SkNf_fma(a,m[13], m[17])))),
@@ -644,9 +683,9 @@ STAGE(matrix_4x5) {
b = B;
a = A;
}
-STAGE(matrix_perspective) {
+STAGE_CTX(matrix_perspective, const float*) {
// N.B. unlike the matrix_NxM stages, this takes a row-major matrix.
- auto m = (const float*)ctx;
+ auto m = ctx;
auto R = SkNf_fma(r,m[0], SkNf_fma(g,m[1], m[2])),
G = SkNf_fma(r,m[3], SkNf_fma(g,m[4], m[5])),
@@ -666,10 +705,10 @@ SI SkNf parametric(const SkNf& v, const SkColorSpaceTransferFn& p) {
// Max(NaN, 0) = 0, but Max(0, NaN) = NaN, so we want this exact order to ensure NaN => 0
return SkNf::Min(SkNf::Max(SkNf::Load(result), 0.0f), 1.0f);
}
-STAGE(parametric_r) { r = parametric(r, *(const SkColorSpaceTransferFn*)ctx); }
-STAGE(parametric_g) { g = parametric(g, *(const SkColorSpaceTransferFn*)ctx); }
-STAGE(parametric_b) { b = parametric(b, *(const SkColorSpaceTransferFn*)ctx); }
-STAGE(parametric_a) { a = parametric(a, *(const SkColorSpaceTransferFn*)ctx); }
+STAGE_CTX(parametric_r, const SkColorSpaceTransferFn*) { r = parametric(r, *ctx); }
+STAGE_CTX(parametric_g, const SkColorSpaceTransferFn*) { g = parametric(g, *ctx); }
+STAGE_CTX(parametric_b, const SkColorSpaceTransferFn*) { b = parametric(b, *ctx); }
+STAGE_CTX(parametric_a, const SkColorSpaceTransferFn*) { a = parametric(a, *ctx); }
SI SkNf table(const SkNf& v, const SkTableTransferFn& table) {
float result[N];
@@ -679,13 +718,13 @@ SI SkNf table(const SkNf& v, const SkTableTransferFn& table) {
// no need to clamp - tables are by-design [0,1] -> [0,1]
return SkNf::Load(result);
}
-STAGE(table_r) { r = table(r, *(const SkTableTransferFn*)ctx); }
-STAGE(table_g) { g = table(g, *(const SkTableTransferFn*)ctx); }
-STAGE(table_b) { b = table(b, *(const SkTableTransferFn*)ctx); }
-STAGE(table_a) { a = table(a, *(const SkTableTransferFn*)ctx); }
+STAGE_CTX(table_r, const SkTableTransferFn*) { r = table(r, *ctx); }
+STAGE_CTX(table_g, const SkTableTransferFn*) { g = table(g, *ctx); }
+STAGE_CTX(table_b, const SkTableTransferFn*) { b = table(b, *ctx); }
+STAGE_CTX(table_a, const SkTableTransferFn*) { a = table(a, *ctx); }
-STAGE(color_lookup_table) {
- const SkColorLookUpTable* colorLUT = (const SkColorLookUpTable*)ctx;
+STAGE_CTX(color_lookup_table, const SkColorLookUpTable*) {
+ const SkColorLookUpTable* colorLUT = ctx;
SkASSERT(3 == colorLUT->inputChannels() || 4 == colorLUT->inputChannels());
SkASSERT(3 == colorLUT->outputChannels());
float result[3][N];
@@ -756,33 +795,29 @@ SI SkNf mirror(const SkNf& v, float l/*imit*/) {
result = SkNf::Min(result, nextafterf(l, 0));
return assert_in_tile(result, l);
}
-STAGE( clamp_x) { r = clamp (r, *(const float*)ctx); }
-STAGE(repeat_x) { r = repeat(r, *(const float*)ctx); }
-STAGE(mirror_x) { r = mirror(r, *(const float*)ctx); }
-STAGE( clamp_y) { g = clamp (g, *(const float*)ctx); }
-STAGE(repeat_y) { g = repeat(g, *(const float*)ctx); }
-STAGE(mirror_y) { g = mirror(g, *(const float*)ctx); }
+STAGE_CTX( clamp_x, const float*) { r = clamp (r, *ctx); }
+STAGE_CTX(repeat_x, const float*) { r = repeat(r, *ctx); }
+STAGE_CTX(mirror_x, const float*) { r = mirror(r, *ctx); }
+STAGE_CTX( clamp_y, const float*) { g = clamp (g, *ctx); }
+STAGE_CTX(repeat_y, const float*) { g = repeat(g, *ctx); }
+STAGE_CTX(mirror_y, const float*) { g = mirror(g, *ctx); }
-STAGE(save_xy) {
- auto sc = (SkImageShaderContext*)ctx;
-
- r.store(sc->x);
- g.store(sc->y);
+STAGE_CTX(save_xy, SkImageShaderContext*) {
+ r.store(ctx->x);
+ g.store(ctx->y);
// Whether bilinear or bicubic, all sample points have the same fractional offset (fx,fy).
// They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid
// surrounding (x,y), all (0.5,0.5) off-center.
auto fract = [](const SkNf& v) { return v - v.floor(); };
- fract(r + 0.5f).store(sc->fx);
- fract(g + 0.5f).store(sc->fy);
+ fract(r + 0.5f).store(ctx->fx);
+ fract(g + 0.5f).store(ctx->fy);
}
-STAGE(accumulate) {
- auto sc = (const SkImageShaderContext*)ctx;
-
+STAGE_CTX(accumulate, const SkImageShaderContext*) {
// Bilinear and bicubic filtering are both separable, so we'll end up with independent
// scale contributions in x and y that we multiply together to get each pixel's scale factor.
- auto scale = SkNf::Load(sc->scalex) * SkNf::Load(sc->scaley);
+ auto scale = SkNf::Load(ctx->scalex) * SkNf::Load(ctx->scaley);
dr = SkNf_fma(scale, r, dr);
dg = SkNf_fma(scale, g, dg);
db = SkNf_fma(scale, b, db);
@@ -794,25 +829,21 @@ STAGE(accumulate) {
// At positive offsets, the x-axis contribution to that rectangular area is fx; (1-fx)
// at negative x offsets. The y-axis is treated symmetrically.
template <int Scale>
-SI void bilinear_x(void* ctx, SkNf* x) {
- auto sc = (SkImageShaderContext*)ctx;
-
- *x = SkNf::Load(sc->x) + Scale*0.5f;
- auto fx = SkNf::Load(sc->fx);
- (Scale > 0 ? fx : (1.0f - fx)).store(sc->scalex);
+SI void bilinear_x(SkImageShaderContext* ctx, SkNf* x) {
+ *x = SkNf::Load(ctx->x) + Scale*0.5f;
+ auto fx = SkNf::Load(ctx->fx);
+ (Scale > 0 ? fx : (1.0f - fx)).store(ctx->scalex);
}
template <int Scale>
-SI void bilinear_y(void* ctx, SkNf* y) {
- auto sc = (SkImageShaderContext*)ctx;
-
- *y = SkNf::Load(sc->y) + Scale*0.5f;
- auto fy = SkNf::Load(sc->fy);
- (Scale > 0 ? fy : (1.0f - fy)).store(sc->scaley);
+SI void bilinear_y(SkImageShaderContext* ctx, SkNf* y) {
+ *y = SkNf::Load(ctx->y) + Scale*0.5f;
+ auto fy = SkNf::Load(ctx->fy);
+ (Scale > 0 ? fy : (1.0f - fy)).store(ctx->scaley);
}
-STAGE(bilinear_nx) { bilinear_x<-1>(ctx, &r); }
-STAGE(bilinear_px) { bilinear_x<+1>(ctx, &r); }
-STAGE(bilinear_ny) { bilinear_y<-1>(ctx, &g); }
-STAGE(bilinear_py) { bilinear_y<+1>(ctx, &g); }
+STAGE_CTX(bilinear_nx, SkImageShaderContext*) { bilinear_x<-1>(ctx, &r); }
+STAGE_CTX(bilinear_px, SkImageShaderContext*) { bilinear_x<+1>(ctx, &r); }
+STAGE_CTX(bilinear_ny, SkImageShaderContext*) { bilinear_y<-1>(ctx, &g); }
+STAGE_CTX(bilinear_py, SkImageShaderContext*) { bilinear_y<+1>(ctx, &g); }
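As a quick illustration of the separable weighting that save_xy/accumulate implement for
the bilinear case, here is a scalar sketch (a hypothetical standalone helper, not Skia
code): each of the four taps gets an x-term times a y-term, and the four weights sum to 1.

    #include <math.h>

    // Scalar bilinear sample at (x, y), with pixel centers at integer + 0.5.
    float bilinear_sample(const float* px, int stride, float x, float y) {
        float fx = (x + 0.5f) - floorf(x + 0.5f),   // the fractional offsets save_xy stores
              fy = (y + 0.5f) - floorf(y + 0.5f);
        int   ix = (int)floorf(x - 0.5f),           // left/top tap, like bilinear_nx/ny
              iy = (int)floorf(y - 0.5f);
        return px[ iy   *stride + ix  ] * (1-fx)*(1-fy)   // each weight: x-term * y-term
             + px[ iy   *stride + ix+1] *    fx *(1-fy)
             + px[(iy+1)*stride + ix  ] * (1-fx)*   fy
             + px[(iy+1)*stride + ix+1] *    fx *   fy;   // the four weights sum to 1
    }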
// In bicubic interpolation, the 16 pixels at +/- 0.5 and +/- 1.5 offsets from the sample
@@ -833,94 +864,87 @@ SI SkNf bicubic_far(const SkNf& t) {
}
template <int Scale>
-SI void bicubic_x(void* ctx, SkNf* x) {
- auto sc = (SkImageShaderContext*)ctx;
-
- *x = SkNf::Load(sc->x) + Scale*0.5f;
- auto fx = SkNf::Load(sc->fx);
- if (Scale == -3) { return bicubic_far (1.0f - fx).store(sc->scalex); }
- if (Scale == -1) { return bicubic_near(1.0f - fx).store(sc->scalex); }
- if (Scale == +1) { return bicubic_near( fx).store(sc->scalex); }
- if (Scale == +3) { return bicubic_far ( fx).store(sc->scalex); }
+SI void bicubic_x(SkImageShaderContext* ctx, SkNf* x) {
+ *x = SkNf::Load(ctx->x) + Scale*0.5f;
+ auto fx = SkNf::Load(ctx->fx);
+ if (Scale == -3) { return bicubic_far (1.0f - fx).store(ctx->scalex); }
+ if (Scale == -1) { return bicubic_near(1.0f - fx).store(ctx->scalex); }
+ if (Scale == +1) { return bicubic_near( fx).store(ctx->scalex); }
+ if (Scale == +3) { return bicubic_far ( fx).store(ctx->scalex); }
SkDEBUGFAIL("unreachable");
}
template <int Scale>
-SI void bicubic_y(void* ctx, SkNf* y) {
- auto sc = (SkImageShaderContext*)ctx;
-
- *y = SkNf::Load(sc->y) + Scale*0.5f;
- auto fy = SkNf::Load(sc->fy);
- if (Scale == -3) { return bicubic_far (1.0f - fy).store(sc->scaley); }
- if (Scale == -1) { return bicubic_near(1.0f - fy).store(sc->scaley); }
- if (Scale == +1) { return bicubic_near( fy).store(sc->scaley); }
- if (Scale == +3) { return bicubic_far ( fy).store(sc->scaley); }
+SI void bicubic_y(SkImageShaderContext* ctx, SkNf* y) {
+ *y = SkNf::Load(ctx->y) + Scale*0.5f;
+ auto fy = SkNf::Load(ctx->fy);
+ if (Scale == -3) { return bicubic_far (1.0f - fy).store(ctx->scaley); }
+ if (Scale == -1) { return bicubic_near(1.0f - fy).store(ctx->scaley); }
+ if (Scale == +1) { return bicubic_near( fy).store(ctx->scaley); }
+ if (Scale == +3) { return bicubic_far ( fy).store(ctx->scaley); }
SkDEBUGFAIL("unreachable");
}
-STAGE(bicubic_n3x) { bicubic_x<-3>(ctx, &r); }
-STAGE(bicubic_n1x) { bicubic_x<-1>(ctx, &r); }
-STAGE(bicubic_p1x) { bicubic_x<+1>(ctx, &r); }
-STAGE(bicubic_p3x) { bicubic_x<+3>(ctx, &r); }
+STAGE_CTX(bicubic_n3x, SkImageShaderContext*) { bicubic_x<-3>(ctx, &r); }
+STAGE_CTX(bicubic_n1x, SkImageShaderContext*) { bicubic_x<-1>(ctx, &r); }
+STAGE_CTX(bicubic_p1x, SkImageShaderContext*) { bicubic_x<+1>(ctx, &r); }
+STAGE_CTX(bicubic_p3x, SkImageShaderContext*) { bicubic_x<+3>(ctx, &r); }
-STAGE(bicubic_n3y) { bicubic_y<-3>(ctx, &g); }
-STAGE(bicubic_n1y) { bicubic_y<-1>(ctx, &g); }
-STAGE(bicubic_p1y) { bicubic_y<+1>(ctx, &g); }
-STAGE(bicubic_p3y) { bicubic_y<+3>(ctx, &g); }
+STAGE_CTX(bicubic_n3y, SkImageShaderContext*) { bicubic_y<-3>(ctx, &g); }
+STAGE_CTX(bicubic_n1y, SkImageShaderContext*) { bicubic_y<-1>(ctx, &g); }
+STAGE_CTX(bicubic_p1y, SkImageShaderContext*) { bicubic_y<+1>(ctx, &g); }
+STAGE_CTX(bicubic_p3y, SkImageShaderContext*) { bicubic_y<+3>(ctx, &g); }
template <typename T>
-SI SkNi offset_and_ptr(T** ptr, const void* ctx, const SkNf& x, const SkNf& y) {
- auto sc = (const SkImageShaderContext*)ctx;
-
+SI SkNi offset_and_ptr(T** ptr, const SkImageShaderContext* ctx, const SkNf& x, const SkNf& y) {
SkNi ix = SkNx_cast<int>(x),
iy = SkNx_cast<int>(y);
- SkNi offset = iy*sc->stride + ix;
+ SkNi offset = iy*ctx->stride + ix;
- *ptr = (const T*)sc->pixels;
+ *ptr = (const T*)ctx->pixels;
return offset;
}
-STAGE(gather_a8) {
+STAGE_CTX(gather_a8, const SkImageShaderContext*) {
const uint8_t* p;
SkNi offset = offset_and_ptr(&p, ctx, r, g);
r = g = b = 0.0f;
a = SkNf_from_byte(gather(tail, p, offset));
}
-STAGE(gather_i8) {
- auto sc = (const SkImageShaderContext*)ctx;
+STAGE_CTX(gather_i8, const SkImageShaderContext*) {
const uint8_t* p;
- SkNi offset = offset_and_ptr(&p, sc, r, g);
+ SkNi offset = offset_and_ptr(&p, ctx, r, g);
SkNi ix = SkNx_cast<int>(gather(tail, p, offset));
- from_8888(gather(tail, sc->ctable->readColors(), ix), &r, &g, &b, &a);
+ from_8888(gather(tail, ctx->ctable->readColors(), ix), &r, &g, &b, &a);
}
-STAGE(gather_g8) {
+STAGE_CTX(gather_g8, const SkImageShaderContext*) {
const uint8_t* p;
SkNi offset = offset_and_ptr(&p, ctx, r, g);
r = g = b = SkNf_from_byte(gather(tail, p, offset));
a = 1.0f;
}
-STAGE(gather_565) {
+STAGE_CTX(gather_565, const SkImageShaderContext*) {
const uint16_t* p;
SkNi offset = offset_and_ptr(&p, ctx, r, g);
from_565(gather(tail, p, offset), &r, &g, &b);
a = 1.0f;
}
-STAGE(gather_4444) {
+STAGE_CTX(gather_4444, const SkImageShaderContext*) {
const uint16_t* p;
SkNi offset = offset_and_ptr(&p, ctx, r, g);
from_4444(gather(tail, p, offset), &r, &g, &b, &a);
}
-STAGE(gather_8888) {
+STAGE_CTX(gather_8888, const SkImageShaderContext*) {
const uint32_t* p;
SkNi offset = offset_and_ptr(&p, ctx, r, g);
from_8888(gather(tail, p, offset), &r, &g, &b, &a);
}
-STAGE(gather_f16) {
+STAGE_CTX(gather_f16, const SkImageShaderContext*) {
const uint64_t* p;
SkNi offset = offset_and_ptr(&p, ctx, r, g);
@@ -940,18 +964,37 @@ SI Fn enum_to_Fn(SkRasterPipeline::StockStage st) {
}
namespace {
+
+ // Compiled manages its memory manually because it's not safe to use
+ // std::vector, SkTDArray, etc without setting us up for big ODR violations.
+
struct Compiled {
- Compiled(const SkRasterPipeline::Stage* stages, int nstages) : fStages(nstages) {
- if (nstages == 0) {
- return;
+ Compiled(const SkRasterPipeline::Stage* stages, int nstages) {
+ int slots = nstages + 1; // One extra for just_return.
+ for (int i = 0; i < nstages; i++) {
+ if (stages[i].ctx) {
+ slots++;
+ }
}
- fStart = enum_to_Fn(stages[0].stage);
- for (int i = 0; i < nstages-1; i++) {
- fStages[i].next = enum_to_Fn(stages[i+1].stage);
- fStages[i].ctx = stages[i].ctx;
+ fProgram = (void**)sk_malloc_throw(slots * sizeof(void*));
+
+ void** ip = fProgram;
+ for (int i = 0; i < nstages; i++) {
+ *ip++ = (void*)enum_to_Fn(stages[i].stage);
+ if (stages[i].ctx) {
+ *ip++ = stages[i].ctx;
+ }
}
- fStages[nstages-1].next = just_return;
- fStages[nstages-1].ctx = stages[nstages-1].ctx;
+ *ip++ = (void*)just_return;
+ }
+ ~Compiled() { sk_free(fProgram); }
+
+ Compiled(const Compiled& o) {
+ int slots = 0;
+ while (o.fProgram[slots++] != (void*)just_return);
+
+ fProgram = (void**)sk_malloc_throw(slots * sizeof(void*));
+ memcpy(fProgram, o.fProgram, slots * sizeof(void*));
}
void operator()(size_t x, size_t y, size_t n) {
@@ -961,19 +1004,20 @@ namespace {
_0 = SkNf(0),
_1 = SkNf(1);
+ void** p = fProgram;
+ auto start = (Fn)load_and_increment(&p);
while (n >= N) {
- fStart(fStages.data(), x*N, X,Y,_1,_0, _0,_0,_0,_0);
+ start(x*N, p, X,Y,_1,_0, _0,_0,_0,_0);
X += (float)N;
x += N;
n -= N;
}
if (n) {
- fStart(fStages.data(), x*N+n, X,Y,_1,_0, _0,_0,_0,_0);
+ start(x*N+n, p, X,Y,_1,_0, _0,_0,_0,_0);
}
}
- Fn fStart = just_return;
- std::vector<Stage> fStages;
+ void** fProgram;
};
}
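Putting the pieces together, a sketch of what Compiled holds at run time and how it is
invoked (the layout follows the constructor above; stages/nstages are whatever SkOpts
hands in, and x, y, n are hypothetical values):

    // For a 3-stage pipeline where stages 0 and 2 carry a context pointer:
    //   slots = 3 stages + 1 just_return + 2 ctx pointers = 6
    //
    //   fProgram -> [ &stage0 | ctx0 | &stage1 | &stage2 | ctx2 | &just_return ]
    //
    // operator()(x, y, n) peels &stage0 off with load_and_increment and calls it with
    // the remaining slots, so each stage consumes exactly the slots it owns.
    Compiled program(stages, nstages);
    program(x, y, n);   // shade n pixels starting at (x, y), N at a time plus a tail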
@@ -993,6 +1037,7 @@ namespace SK_OPTS_NS {
#undef SI
#undef STAGE
+#undef STAGE_CTX
#undef RGBA_XFERMODE
#undef RGB_XFERMODE