aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/jumper
diff options
context:
space:
mode:
authorGravatar Mike Klein <mtklein@chromium.org>2017-08-29 19:49:50 -0400
committerGravatar Skia Commit-Bot <skia-commit-bot@chromium.org>2017-08-30 13:02:12 +0000
commit4c6024afcce54cc775810b5f8ebccbcdbc6a43d7 (patch)
treeecb39a6903f0d5fb72b08be76cc08b55cde90206 /src/jumper
parent380b90c92bead0c1b869df08b2d2790c302c5c71 (diff)
improve ARMv7 8-bit codegen
We need to make two changes to keep all values in registers:
  1) pass raw U8 values instead of V structs that wrap them
  2) switch to aapcs-vfp, which allows exactly 8x U8 arguments

Code generation goes from ridiculous looking to lovely.

Change-Id: Ibd53bdd86345b59bd987a1f79205645d80c5cbc3
Reviewed-on: https://skia-review.googlesource.com/40021
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Florin Malita <fmalita@chromium.org>
Diffstat (limited to 'src/jumper')
-rw-r--r--src/jumper/SkJumper_stages_8bit.cpp43
1 file changed, 27 insertions, 16 deletions
diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp
index f76634a5b7..9ab1c28f67 100644
--- a/src/jumper/SkJumper_stages_8bit.cpp
+++ b/src/jumper/SkJumper_stages_8bit.cpp
@@ -168,7 +168,7 @@ SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, int x, int y) {
struct Params { size_t x,y,tail; };
using Stage =
- void(const Params* params, void** program, R src_lo, R src_hi, R dst_lo, R dst_hi);
+ void(*)(const Params* params, void** program, R src_lo, R src_hi, R dst_lo, R dst_hi);
#if defined(__AVX__)
// We really want to make sure all paths go through this function's (implicit) vzeroupper.
@@ -183,7 +183,7 @@ SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, int x, int y) {
#else
R r{}; // Next best is zero'd for compilers that will complain about uninitialized values.
#endif
- auto start = (Stage*)load_and_inc(program);
+ auto start = (Stage)load_and_inc(program);
for (; y < ylimit; y++) {
Params params = { x,y,0 };
while (params.x + kStride <= xlimit) {
@@ -209,7 +209,7 @@ SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, int x, int y) {
name##_k(ctx, params->x, params->y, params->tail, src, dst); \
split(src.u8x4, &src_lo, &src_hi); \
split(dst.u8x4, &dst_lo, &dst_hi); \
- auto next = (Stage*)load_and_inc(program); \
+ auto next = (Stage)load_and_inc(program); \
next(params,program, src_lo,src_hi, dst_lo,dst_hi); \
} \
SI void name##_k(LazyCtx ctx, size_t x, size_t y, size_t tail, V& src, V& dst)
@@ -458,6 +458,13 @@ SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, int x, int y) {
#elif defined(JUMPER_HAS_NEON_8BIT) // These are generally compiled as part of Skia.
#include <arm_neon.h>
+
+ #if defined(__arm__)
+ #define ABI __attribute__((pcs("aapcs-vfp")))
+ #else
+ #define ABI
+ #endif
+
#define WRAP(name) sk_##name##_8bit
// On ARM it's so easy to de-interlace on loads and re-interlace on stores that
@@ -505,14 +512,15 @@ SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, int x, int y) {
SI V min(V a, V b) { return if_then_else(a > b, b.vec, a.vec); }
- using Stage = void(void** program, size_t x, size_t y, size_t tail,
- V r, V g, V b, V a,
- V dr, V dg, V db, V da);
+ // We need to pass as U8 (raw vector types unwrapped by any struct) to appease ARMv7's ABI.
+ using Stage = void (ABI *)(void** program, size_t x, size_t y, size_t tail,
+ U8 r, U8 g, U8 b, U8 a,
+ U8 dr, U8 dg, U8 db, U8 da);
- extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t ylimit,
- void** program) {
+ ABI extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t ylimit,
+ void** program) {
V v{};
- auto start = (Stage*)load_and_inc(program);
+ auto start = (Stage)load_and_inc(program);
const size_t x0 = x;
for (; y < ylimit; y++) {
x = x0;
@@ -526,19 +534,22 @@ SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, int x, int y) {
}
}
- extern "C" void WRAP(just_return)(void**,size_t,size_t,size_t, V,V,V,V, V,V,V,V) {}
+ ABI extern "C" void WRAP(just_return)(void**,size_t,size_t,size_t,
+ U8,U8,U8,U8, U8,U8,U8,U8) {}
#define STAGE(name) \
SI void name##_k(LazyCtx ctx, size_t x, size_t y, size_t tail, \
V& r, V& g, V& b, V& a, \
V& dr, V& dg, V& db, V& da); \
- extern "C" void WRAP(name)(void** program, size_t x, size_t y, size_t tail, \
- V r, V g, V b, V a, \
- V dr, V dg, V db, V da) { \
+ ABI extern "C" void WRAP(name)(void** program, size_t x, size_t y, size_t tail, \
+ U8 r, U8 g, U8 b, U8 a, \
+ U8 dr, U8 dg, U8 db, U8 da) { \
LazyCtx ctx(program); \
- name##_k(ctx,x,y,tail, r,g,b,a, dr,dg,db,da); \
- auto next = (Stage*)load_and_inc(program); \
- next(program, x,y,tail, r,g,b,a, dr,dg,db,da); \
+ V R = r, G = g, B = b, A = a, \
+ DR = dr, DG = dg, DB = db, DA = da; \
+ name##_k(ctx,x,y,tail, R,G,B,A, DR,DG,DB,DA); \
+ auto next = (Stage)load_and_inc(program); \
+ next(program, x,y,tail, R,G,B,A, DR,DG,DB,DA); \
} \
SI void name##_k(LazyCtx ctx, size_t x, size_t y, size_t tail, \
V& r, V& g, V& b, V& a, \