Diffstat (limited to 'src')
-rw-r--r--  src/jumper/SkJumper_stages.cpp       132
-rw-r--r--  src/jumper/SkJumper_stages_lowp.cpp   66
2 files changed, 99 insertions, 99 deletions
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index d192728419..1f9848e1fa 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -47,40 +47,40 @@ static const size_t N = sizeof(F) / sizeof(float);
     // General-purpose registers are also tight, so we put most of those on the stack too.
     // On ARMv7, we do the same so that we can make the r,g,b,a vectors wider.
     struct Params {
-        size_t x, y, tail;
+        size_t dx, dy, tail;
         F dr,dg,db,da;
     };
     using Stage = void(Params*, void** program, F r, F g, F b, F a);
 #else
     // We keep program the second argument, so that it's passed in rsi for load_and_inc().
-    using Stage = void(size_t tail, void** program, size_t x, size_t y, F,F,F,F, F,F,F,F);
+    using Stage = void(size_t tail, void** program, size_t dx, size_t dy, F,F,F,F, F,F,F,F);
 #endif

 MAYBE_MSABI
-extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t ylimit,
+extern "C" void WRAP(start_pipeline)(size_t dx, size_t dy, size_t xlimit, size_t ylimit,
                                      void** program) {
     auto start = (Stage*)load_and_inc(program);
-    const size_t x0 = x;
-    for (; y < ylimit; y++) {
+    const size_t x0 = dx;
+    for (; dy < ylimit; dy++) {
     #if defined(__i386__) || defined(_M_IX86) || defined(__arm__)
-        Params params = { x0,y,0, 0,0,0,0 };
-        while (params.x + N <= xlimit) {
+        Params params = { x0,dy,0, 0,0,0,0 };
+        while (params.dx + N <= xlimit) {
             start(&params,program, 0,0,0,0);
-            params.x += N;
+            params.dx += N;
         }
-        if (size_t tail = xlimit - params.x) {
+        if (size_t tail = xlimit - params.dx) {
             params.tail = tail;
             start(&params,program, 0,0,0,0);
         }
     #else
-        x = x0;
-        while (x + N <= xlimit) {
-            start(0,program,x,y, 0,0,0,0, 0,0,0,0);
-            x += N;
+        dx = x0;
+        while (dx + N <= xlimit) {
+            start(0,program,dx,dy, 0,0,0,0, 0,0,0,0);
+            dx += N;
         }
-        if (size_t tail = xlimit - x) {
-            start(tail,program,x,y, 0,0,0,0, 0,0,0,0);
+        if (size_t tail = xlimit - dx) {
+            start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
         }
     #endif
     }
@@ -88,28 +88,28 @@ extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t y

 #if defined(__i386__) || defined(_M_IX86) || defined(__arm__)
     #define STAGE(name, ...)                                                  \
-        SI void name##_k(__VA_ARGS__, size_t x, size_t y, size_t tail,        \
+        SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,      \
                          F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
         extern "C" void WRAP(name)(Params* params, void** program,            \
                                    F r, F g, F b, F a) {                      \
-            name##_k(Ctx{program},params->x,params->y,params->tail, r,g,b,a,  \
+            name##_k(Ctx{program},params->dx,params->dy,params->tail, r,g,b,a,\
                      params->dr, params->dg, params->db, params->da);         \
             auto next = (Stage*)load_and_inc(program);                        \
             next(params,program, r,g,b,a);                                    \
         }                                                                     \
-        SI void name##_k(__VA_ARGS__, size_t x, size_t y, size_t tail,        \
+        SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,      \
                          F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
 #else
     #define STAGE(name, ...)                                                  \
-        SI void name##_k(__VA_ARGS__, size_t x, size_t y, size_t tail,        \
+        SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,      \
                          F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
-        extern "C" void WRAP(name)(size_t tail, void** program, size_t x, size_t y,   \
+        extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
                                    F r, F g, F b, F a, F dr, F dg, F db, F da) { \
-            name##_k(Ctx{program},x,y,tail, r,g,b,a, dr,dg,db,da);            \
+            name##_k(Ctx{program},dx,dy,tail, r,g,b,a, dr,dg,db,da);          \
             auto next = (Stage*)load_and_inc(program);                        \
-            next(tail,program,x,y, r,g,b,a, dr,dg,db,da);                     \
+            next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                   \
         }                                                                     \
-        SI void name##_k(__VA_ARGS__, size_t x, size_t y, size_t tail,        \
+        SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,      \
                          F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
 #endif
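The machinery above is effectively a tiny interpreter: `program` is a flat array of stage function pointers interleaved with their contexts, each stage pops its entry and tail-calls the next, and `start_pipeline` runs full batches of N pixels across each row before one final call with `tail` set to the remainder. A self-contained scalar sketch of the same pattern, with a single float standing in for the real F vectors (all names here are illustrative, not the file's SIMD code):

```cpp
#include <cstddef>
#include <cstdio>

// One "lane" of pipeline state instead of the real F vectors.
using Stage = void (*)(size_t tail, void** program, size_t dx, size_t dy, float r);

// Pop the next pointer off the program, advancing it -- same idea as load_and_inc().
static void* load_and_inc(void**& program) { return *program++; }

static void just_return(size_t, void**, size_t, size_t, float) {}

static void seed(size_t tail, void** program, size_t dx, size_t dy, float r) {
    r = (float)dx + 0.5f;                      // pixel-center x, like seed_shader
    auto next = (Stage)load_and_inc(program);
    next(tail, program, dx, dy, r);            // tail-call the next stage
}

static void dump(size_t tail, void** program, size_t dx, size_t dy, float r) {
    std::printf("dx=%zu r=%g tail=%zu\n", dx, r, tail);
    auto next = (Stage)load_and_inc(program);
    next(tail, program, dx, dy, r);
}

int main() {
    void* program[] = { (void*)seed, (void*)dump, (void*)just_return };
    const size_t N = 4, x0 = 0, xlimit = 10;

    // The driver loop from start_pipeline: full batches of N, then one tail call.
    size_t dx = x0;
    for (; dx + N <= xlimit; dx += N) {
        void** p = program;
        ((Stage)load_and_inc(p))(0, p, dx, 0, 0.0f);
    }
    if (size_t tail = xlimit - dx) {
        void** p = program;
        ((Stage)load_and_inc(p))(tail, p, dx, 0, 0.0f);
    }
}
```

A nonzero `tail` is how the last, partial batch tells load/store stages to touch only `tail` pixels instead of a full N.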
@@ -192,10 +192,10 @@ SI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) {
     *a = cast((_8888 >> 24)       ) * (1/255.0f);
 }

-// Used by load_ and store_ stages to get to the right (x,y) starting point of contiguous memory.
+// Used by load_ and store_ stages to get to the right (dx,dy) starting point of contiguous memory.
 template <typename T>
-SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, int x, int y) {
-    return (T*)ctx->pixels + y*ctx->stride + x;
+SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, int dx, int dy) {
+    return (T*)ctx->pixels + dy*ctx->stride + dx;
 }

 // Used by gather_ stages to calculate the base pointer and a vector of indices to load.
@@ -215,22 +215,22 @@ SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) {
 // Now finally, normal Stages!

 STAGE(seed_shader, Ctx::None) {
-    // It's important for speed to explicitly cast(x) and cast(y),
+    // It's important for speed to explicitly cast(dx) and cast(dy),
     // which has the effect of splatting them to vectors before converting to floats.
     // On Intel this breaks a data dependency on previous loop iterations' registers.
     float iota[] = { 0.5f,1.5f,2.5f,3.5f,4.5f,5.5f,6.5f,7.5f };

-    r = cast(x) + unaligned_load<F>(iota);
-    g = cast(y) + 0.5f;
+    r = cast(dx) + unaligned_load<F>(iota);
+    g = cast(dy) + 0.5f;
     b = 1.0f;
     a = 0;
     dr = dg = db = da = 0;
 }

 STAGE(dither, const float* rate) {
-    // Get [(x,y), (x+1,y), (x+2,y), ...] loaded up in integer vectors.
+    // Get [(dx,dy), (dx+1,dy), (dx+2,dy), ...] loaded up in integer vectors.
     uint32_t iota[] = {0,1,2,3,4,5,6,7};
-    U32 X = x + unaligned_load<U32>(iota),
-        Y = y;
+    U32 X = dx + unaligned_load<U32>(iota),
+        Y = dy;

     // We're doing 8x8 ordered dithering, see https://en.wikipedia.org/wiki/Ordered_dithering.
     // In this case n=8 and we're using the matrix that looks like 1/64 x [ 0 48 12 60 ... ].
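For reference, that 8x8 threshold can be produced by bit-mixing rather than a table lookup. A scalar sketch, assuming only x and x^y feed the pattern (the vector code does this on whole lanes of X and Y at once, and its exact bit layout may differ):

```cpp
#include <cstdint>

// Mix the low 3 bits of x and x^y into a 6-bit value M in [0,64); this yields
// the Bayer matrix 1/64 x [ 0 48 12 60 ... ] along each row.  Then recenter
// so the dither noise has zero mean over all 64 cells.
static float dither_threshold(uint32_t x, uint32_t y) {
    uint32_t X = x,
             Y = y ^ x;                 // only x and x^y matter to the pattern
    uint32_t M = (Y & 1) << 5 | (X & 1) << 4
               | (Y & 2) << 2 | (X & 2) << 1
               | (Y & 4) >> 1 | (X & 4) >> 2;
    return M * (2/128.0f) - (63/128.0f);   // in [-63/128, +63/128], mean 0
}
```

Spot check: along y = 0 the first four thresholds come out as 0, 48, 12, 60 (in 64ths), matching the matrix named in the comment above.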
@@ -480,7 +480,7 @@ STAGE(luminosity, Ctx::None) {
 }

 STAGE(srcover_rgba_8888, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<uint32_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

     U32 dst = load<U32>(ptr, tail);
     dr = cast((dst      ) & 0xff);
@@ -679,7 +679,7 @@ STAGE(scale_1_float, const float* c) {
     a = a * *c;
 }
 STAGE(scale_u8, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);

     auto scales = load<U8>(ptr, tail);
     auto c = from_byte(scales);
@@ -690,7 +690,7 @@ STAGE(scale_u8, const SkJumper_MemoryCtx* ctx) {
     a = a * c;
 }
 STAGE(scale_565, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const uint16_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);

     F cr,cg,cb;
     from_565(load<U16>(ptr, tail), &cr, &cg, &cb);
@@ -714,7 +714,7 @@ STAGE(lerp_1_float, const float* c) {
     a = lerp(da, a, *c);
 }
 STAGE(lerp_u8, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);

     auto scales = load<U8>(ptr, tail);
     auto c = from_byte(scales);
@@ -725,7 +725,7 @@ STAGE(lerp_u8, const SkJumper_MemoryCtx* ctx) {
     a = lerp(da, a, c);
 }
 STAGE(lerp_565, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const uint16_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);

     F cr,cg,cb;
     from_565(load<U16>(ptr, tail), &cr, &cg, &cb);
@@ -739,14 +739,14 @@ STAGE(lerp_565, const SkJumper_MemoryCtx* ctx) {
 }

 STAGE(load_tables, const SkJumper_LoadTablesCtx* c) {
-    auto px = load<U32>((const uint32_t*)c->src + x, tail);
+    auto px = load<U32>((const uint32_t*)c->src + dx, tail);
     r = gather(c->r, (px      ) & 0xff);
     g = gather(c->g, (px >>  8) & 0xff);
     b = gather(c->b, (px >> 16) & 0xff);
     a = cast(        (px >> 24)) * (1/255.0f);
 }
 STAGE(load_tables_u16_be, const SkJumper_LoadTablesCtx* c) {
-    auto ptr = (const uint16_t*)c->src + 4*x;
+    auto ptr = (const uint16_t*)c->src + 4*dx;

     U16 R,G,B,A;
     load4(ptr, tail, &R,&G,&B,&A);
@@ -758,7 +758,7 @@ STAGE(load_tables_u16_be, const SkJumper_LoadTablesCtx* c) {
     a = (1/65535.0f) * cast(expand(bswap(A)));
 }
 STAGE(load_tables_rgb_u16_be, const SkJumper_LoadTablesCtx* c) {
-    auto ptr = (const uint16_t*)c->src + 3*x;
+    auto ptr = (const uint16_t*)c->src + 3*dx;

     U16 R,G,B;
     load3(ptr, tail, &R,&G,&B);
@@ -834,13 +834,13 @@ STAGE(lab_to_xyz, Ctx::None) {
 }

 STAGE(load_a8, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);

     r = g = b = 0.0f;
     a = from_byte(load<U8>(ptr, tail));
 }
 STAGE(load_a8_dst, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);

     dr = dg = db = 0.0f;
     da = from_byte(load<U8>(ptr, tail));
@@ -852,20 +852,20 @@ STAGE(gather_a8, const SkJumper_GatherCtx* ctx) {
     a = from_byte(gather(ptr, ix));
 }
 STAGE(store_a8, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<uint8_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy);

     U8 packed = pack(pack(round(a, 255.0f)));
     store(ptr, packed, tail);
 }

 STAGE(load_g8, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);

     r = g = b = from_byte(load<U8>(ptr, tail));
     a = 1.0f;
 }
 STAGE(load_g8_dst, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const uint8_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);

     dr = dg = db = from_byte(load<U8>(ptr, tail));
     da = 1.0f;
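The scale_ and lerp_ stages above apply coverage `c` in two different ways, and srcover_rgba_8888 folds load + src-over blend + store into a single stage. Per channel, in scalar form (helper names are illustrative):

```cpp
// Per-channel coverage application, with c in [0,1]:
//   scale_* : src' = src * c
//   lerp_*  : src' = dst + (src - dst) * c   (c=0 keeps dst, c=1 keeps src)
static float scale(float src, float c)            { return src * c; }
static float lerp (float dst, float src, float c) { return dst + (src - dst) * c; }

// srcover_rgba_8888: out = src + dst * (1 - src_alpha), per channel,
// with premultiplied-alpha color.
static float srcover(float src, float dst, float src_a) {
    return src + dst * (1.0f - src_a);
}
```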
@@ -878,13 +878,13 @@ STAGE(gather_g8, const SkJumper_GatherCtx* ctx) {
 }

 STAGE(load_565, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const uint16_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);

     from_565(load<U16>(ptr, tail), &r,&g,&b);
     a = 1.0f;
 }
 STAGE(load_565_dst, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const uint16_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);

     from_565(load<U16>(ptr, tail), &dr,&dg,&db);
     da = 1.0f;
@@ -896,7 +896,7 @@ STAGE(gather_565, const SkJumper_GatherCtx* ctx) {
     a = 1.0f;
 }
 STAGE(store_565, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<uint16_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);

     U16 px = pack( round(r, 31.0f) << 11
                  | round(g, 63.0f) <<  5
@@ -905,11 +905,11 @@ STAGE(store_565, const SkJumper_MemoryCtx* ctx) {
 }

 STAGE(load_4444, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const uint16_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
     from_4444(load<U16>(ptr, tail), &r,&g,&b,&a);
 }
 STAGE(load_4444_dst, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const uint16_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
     from_4444(load<U16>(ptr, tail), &dr,&dg,&db,&da);
 }
 STAGE(gather_4444, const SkJumper_GatherCtx* ctx) {
@@ -918,7 +918,7 @@ STAGE(gather_4444, const SkJumper_GatherCtx* ctx) {
     from_4444(gather(ptr, ix), &r,&g,&b,&a);
 }
 STAGE(store_4444, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<uint16_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
     U16 px = pack( round(r, 15.0f) << 12
                  | round(g, 15.0f) <<  8
                  | round(b, 15.0f) <<  4
@@ -927,11 +927,11 @@ STAGE(store_4444, const SkJumper_MemoryCtx* ctx) {
 }

 STAGE(load_8888, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const uint32_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
     from_8888(load<U32>(ptr, tail), &r,&g,&b,&a);
 }
 STAGE(load_8888_dst, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const uint32_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
     from_8888(load<U32>(ptr, tail), &dr,&dg,&db,&da);
 }
 STAGE(gather_8888, const SkJumper_GatherCtx* ctx) {
@@ -940,7 +940,7 @@ STAGE(gather_8888, const SkJumper_GatherCtx* ctx) {
     from_8888(gather(ptr, ix), &r,&g,&b,&a);
 }
 STAGE(store_8888, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<uint32_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

     U32 px = round(r, 255.0f)
            | round(g, 255.0f) <<  8
@@ -950,11 +950,11 @@ STAGE(store_8888, const SkJumper_MemoryCtx* ctx) {
 }

 STAGE(load_bgra, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const uint32_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
     from_8888(load<U32>(ptr, tail), &b,&g,&r,&a);
 }
 STAGE(load_bgra_dst, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const uint32_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
     from_8888(load<U32>(ptr, tail), &db,&dg,&dr,&da);
 }
 STAGE(gather_bgra, const SkJumper_GatherCtx* ctx) {
@@ -963,7 +963,7 @@ STAGE(gather_bgra, const SkJumper_GatherCtx* ctx) {
     from_8888(gather(ptr, ix), &b,&g,&r,&a);
 }
 STAGE(store_bgra, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<uint32_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

     U32 px = round(b, 255.0f)
            | round(g, 255.0f) <<  8
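A scalar sketch of what the from_565 unpacking and store_565 packing above do per pixel, assuming the same round(x, m) = nearest-int(x*m) convention as the diff (these standalone helpers are illustrative, not the file's vector code):

```cpp
#include <cstdint>

// RGB565: r in bits 15..11 (5 bits), g in bits 10..5 (6 bits), b in bits 4..0.
static void rgb_from_565(uint16_t px, float* r, float* g, float* b) {
    *r = ((px >> 11) & 31) * (1/31.0f);   // normalize each field to [0,1]
    *g = ((px >>  5) & 63) * (1/63.0f);
    *b = ( px        & 31) * (1/31.0f);
}

static uint16_t rgb_to_565(float r, float g, float b) {
    auto round_to = [](float x, float m) { return (uint32_t)(x*m + 0.5f); };
    return (uint16_t)( round_to(r, 31.0f) << 11
                     | round_to(g, 63.0f) <<  5
                     | round_to(b, 31.0f) );
}
```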
@@ -973,7 +973,7 @@ STAGE(store_bgra, const SkJumper_MemoryCtx* ctx) {
 }

 STAGE(load_f16, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const uint64_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);

     U16 R,G,B,A;
     load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
@@ -983,7 +983,7 @@ STAGE(load_f16, const SkJumper_MemoryCtx* ctx) {
     a = from_half(A);
 }
 STAGE(load_f16_dst, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const uint64_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);

     U16 R,G,B,A;
     load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
@@ -1005,7 +1005,7 @@ STAGE(gather_f16, const SkJumper_GatherCtx* ctx) {
     a = from_half(A);
 }
 STAGE(store_f16, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<uint64_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<uint64_t>(ctx, dx,dy);

     store4((uint16_t*)ptr,tail, to_half(r)
                               , to_half(g)
                               , to_half(b)
@@ -1013,7 +1013,7 @@ STAGE(store_f16, const SkJumper_MemoryCtx* ctx) {
 }

 STAGE(load_u16_be, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const uint16_t>(ctx, 4*x,y);
+    auto ptr = ptr_at_xy<const uint16_t>(ctx, 4*dx,dy);

     U16 R,G,B,A;
     load4(ptr,tail, &R,&G,&B,&A);
@@ -1024,7 +1024,7 @@ STAGE(load_u16_be, const SkJumper_MemoryCtx* ctx) {
     a = (1/65535.0f) * cast(expand(bswap(A)));
 }
 STAGE(load_rgb_u16_be, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const uint16_t>(ctx, 3*x,y);
+    auto ptr = ptr_at_xy<const uint16_t>(ctx, 3*dx,dy);

     U16 R,G,B;
     load3(ptr,tail, &R,&G,&B);
@@ -1035,7 +1035,7 @@ STAGE(load_rgb_u16_be, const SkJumper_MemoryCtx* ctx) {
     a = 1.0f;
 }
 STAGE(store_u16_be, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<uint16_t>(ctx, 4*x,y);
+    auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,dy);

     U16 R = bswap(pack(round(r, 65535.0f))),
         G = bswap(pack(round(g, 65535.0f))),
@@ -1046,15 +1046,15 @@ STAGE(store_u16_be, const SkJumper_MemoryCtx* ctx) {
 }

 STAGE(load_f32, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const float>(ctx, 4*x,y);
+    auto ptr = ptr_at_xy<const float>(ctx, 4*dx,dy);
     load4(ptr,tail, &r,&g,&b,&a);
 }
 STAGE(load_f32_dst, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<const float>(ctx, 4*x,y);
+    auto ptr = ptr_at_xy<const float>(ctx, 4*dx,dy);
     load4(ptr,tail, &dr,&dg,&db,&da);
 }
 STAGE(store_f32, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<float>(ctx, 4*x,y);
+    auto ptr = ptr_at_xy<float>(ctx, 4*dx,dy);
     store4(ptr,tail, r,g,b,a);
 }
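The *_u16_be stages above handle interleaved big-endian 16-bit samples: swap each value's bytes, then normalize. A scalar sketch of the per-sample conversion (bswap16 here is a hypothetical stand-in for the file's vector bswap):

```cpp
#include <cstdint>

// Big-endian uint16 sample -> float in [0,1]: byte-swap, then scale.
static uint16_t bswap16(uint16_t v)      { return (uint16_t)((v << 8) | (v >> 8)); }
static float    from_u16_be(uint16_t v)  { return bswap16(v) * (1/65535.0f); }

// And back: scale to [0,65535] with rounding, then byte-swap for storage.
static uint16_t to_u16_be(float x)       { return bswap16((uint16_t)(x*65535.0f + 0.5f)); }
```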
diff --git a/src/jumper/SkJumper_stages_lowp.cpp b/src/jumper/SkJumper_stages_lowp.cpp
index 0882f81cf5..79e3fbd0d7 100644
--- a/src/jumper/SkJumper_stages_lowp.cpp
+++ b/src/jumper/SkJumper_stages_lowp.cpp
@@ -53,7 +53,7 @@ static const size_t N = sizeof(U16) / sizeof(uint16_t);

 // We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64.
-using Stage = void (ABI*)(size_t tail, void** program, size_t x, size_t y,
+using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
                           U16  r, U16  g, U16  b, U16  a,
                           U16 dr, U16 dg, U16 db, U16 da);

@@ -64,13 +64,13 @@ ABI extern "C" void WRAP(start_pipeline)(const size_t x0,
                                          const size_t ylimit,
                                          void** program) {
     auto start = (Stage)load_and_inc(program);
-    for (size_t y = y0; y < ylimit; y++) {
-        size_t x = x0;
-        for (; x + N <= xlimit; x += N) {
-            start(   0,program,x,y, 0,0,0,0, 0,0,0,0);
+    for (size_t dy = y0; dy < ylimit; dy++) {
+        size_t dx = x0;
+        for (; dx + N <= xlimit; dx += N) {
+            start(   0,program,dx,dy, 0,0,0,0, 0,0,0,0);
         }
-        if (size_t tail = xlimit - x) {
-            start(tail,program,x,y, 0,0,0,0, 0,0,0,0);
+        if (size_t tail = xlimit - dx) {
+            start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
         }
     }
 }
@@ -79,17 +79,17 @@ ABI extern "C" void WRAP(just_return)(size_t,void**,size_t,size_t,
                                       U16,U16,U16,U16, U16,U16,U16,U16) {}

 #define STAGE(name, ...)                                                       \
-    SI void name##_k(__VA_ARGS__, size_t x, size_t y, size_t tail,             \
+    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,           \
                      U16&  r, U16&  g, U16&  b, U16&  a,                       \
                      U16& dr, U16& dg, U16& db, U16& da);                      \
-    ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t x, size_t y,   \
+    ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy, \
                                    U16  r, U16  g, U16  b, U16  a,             \
                                    U16 dr, U16 dg, U16 db, U16 da) {           \
-        name##_k(Ctx{program}, x,y,tail, r,g,b,a, dr,dg,db,da);                \
+        name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da);              \
         auto next = (Stage)load_and_inc(program);                              \
-        next(tail,program,x,y, r,g,b,a, dr,dg,db,da);                          \
+        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                        \
     }                                                                          \
-    SI void name##_k(__VA_ARGS__, size_t x, size_t y, size_t tail,             \
+    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,           \
                      U16&  r, U16&  g, U16&  b, U16&  a,                       \
                      U16& dr, U16& dg, U16& db, U16& da)

@@ -244,8 +244,8 @@ STAGE(invert, Ctx::None) {

 // ~~~~~~ Helpers for interacting with memory ~~~~~~ //

 template <typename T>
-SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, size_t x, size_t y) {
-    return (T*)ctx->pixels + y*ctx->stride + x;
+SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, size_t dx, size_t dy) {
+    return (T*)ctx->pixels + dy*ctx->stride + dx;
 }

 template <typename V, typename T>
@@ -369,23 +369,23 @@ SI void store_8888(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
 }

 STAGE(load_8888, const SkJumper_MemoryCtx* ctx) {
-    load_8888(ptr_at_xy<const uint32_t>(ctx, x,y), tail, &r,&g,&b,&a);
+    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
 }
 STAGE(load_8888_dst, const SkJumper_MemoryCtx* ctx) {
-    load_8888(ptr_at_xy<const uint32_t>(ctx, x,y), tail, &dr,&dg,&db,&da);
+    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
 }
 STAGE(store_8888, const SkJumper_MemoryCtx* ctx) {
-    store_8888(ptr_at_xy<uint32_t>(ctx, x,y), tail, r,g,b,a);
+    store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a);
 }

 STAGE(load_bgra, const SkJumper_MemoryCtx* ctx) {
-    load_8888(ptr_at_xy<const uint32_t>(ctx, x,y), tail, &b,&g,&r,&a);
+    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &b,&g,&r,&a);
 }
 STAGE(load_bgra_dst, const SkJumper_MemoryCtx* ctx) {
-    load_8888(ptr_at_xy<const uint32_t>(ctx, x,y), tail, &db,&dg,&dr,&da);
+    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &db,&dg,&dr,&da);
 }
 STAGE(store_bgra, const SkJumper_MemoryCtx* ctx) {
-    store_8888(ptr_at_xy<uint32_t>(ctx, x,y), tail, b,g,r,a);
+    store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, b,g,r,a);
 }

 // ~~~~~~ 16-bit memory loads and stores ~~~~~~ //
@@ -414,15 +414,15 @@ SI void store_565(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) {
 }

 STAGE(load_565, const SkJumper_MemoryCtx* ctx) {
-    load_565(ptr_at_xy<const uint16_t>(ctx, x,y), tail, &r,&g,&b);
+    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b);
     a = 255;
 }
 STAGE(load_565_dst, const SkJumper_MemoryCtx* ctx) {
-    load_565(ptr_at_xy<const uint16_t>(ctx, x,y), tail, &dr,&dg,&db);
+    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db);
     da = 255;
 }
 STAGE(store_565, const SkJumper_MemoryCtx* ctx) {
-    store_565(ptr_at_xy<uint16_t>(ctx, x,y), tail, r,g,b);
+    store_565(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b);
 }

 // ~~~~~~ 8-bit memory loads and stores ~~~~~~ //
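The lowp scale_ and lerp_ stages in the hunks that follow multiply two 0..255 values inside 16-bit lanes and then need the product divided by 255, with rounding, without leaving integer registers. A scalar sketch of the standard add-and-shift formulation (this file's actual div255 may differ in detail):

```cpp
#include <cstdint>

// Exact round(v / 255.0) for 0 <= v <= 255*255, using only adds and shifts:
// add the rounding bias, then fold the high byte back in twice.
static uint16_t div255(uint32_t v) {
    uint32_t x = v + 128;                  // rounding bias
    return (uint16_t)((x + (x >> 8)) >> 8);
}
```

Spot checks: div255(0) == 0, div255(255*255) == 255, div255(127) == 0, and div255(128) == 1, matching round-to-nearest.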
@@ -436,22 +436,22 @@ SI void store_8(uint8_t* ptr, size_t tail, U16 v) {

 STAGE(load_a8, const SkJumper_MemoryCtx* ctx) {
     r = g = b = 0;
-    a = load_8(ptr_at_xy<const uint8_t>(ctx, x,y), tail);
+    a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
 }
 STAGE(load_a8_dst, const SkJumper_MemoryCtx* ctx) {
     dr = dg = db = 0;
-    da = load_8(ptr_at_xy<const uint8_t>(ctx, x,y), tail);
+    da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
 }
 STAGE(store_a8, const SkJumper_MemoryCtx* ctx) {
-    store_8(ptr_at_xy<uint8_t>(ctx, x,y), tail, a);
+    store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a);
 }

 STAGE(load_g8, const SkJumper_MemoryCtx* ctx) {
-    r = g = b = load_8(ptr_at_xy<const uint8_t>(ctx, x,y), tail);
+    r = g = b = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
     a = 255;
 }
 STAGE(load_g8_dst, const SkJumper_MemoryCtx* ctx) {
-    dr = dg = db = load_8(ptr_at_xy<const uint8_t>(ctx, x,y), tail);
+    dr = dg = db = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
     da = 255;
 }

 STAGE(luminance_to_alpha, Ctx::None) {
@@ -477,14 +477,14 @@ STAGE(lerp_1_float, const float* f) {
 }

 STAGE(scale_u8, const SkJumper_MemoryCtx* ctx) {
-    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, x,y), tail);
+    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
     r = div255( r * c );
     g = div255( g * c );
     b = div255( b * c );
     a = div255( a * c );
 }
 STAGE(lerp_u8, const SkJumper_MemoryCtx* ctx) {
-    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, x,y), tail);
+    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
     r = lerp(dr, r, c);
     g = lerp(dg, g, c);
     b = lerp(db, b, c);
@@ -498,7 +498,7 @@ SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) {
 }
 STAGE(scale_565, const SkJumper_MemoryCtx* ctx) {
     U16 cr,cg,cb;
-    load_565(ptr_at_xy<const uint16_t>(ctx, x,y), tail, &cr,&cg,&cb);
+    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
     U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

     r = div255( r * cr );
@@ -508,7 +508,7 @@ STAGE(scale_565, const SkJumper_MemoryCtx* ctx) {
 }
 STAGE(lerp_565, const SkJumper_MemoryCtx* ctx) {
     U16 cr,cg,cb;
-    load_565(ptr_at_xy<const uint16_t>(ctx, x,y), tail, &cr,&cg,&cb);
+    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
     U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

     r = lerp(dr, r, cr);
@@ -520,7 +520,7 @@ STAGE(lerp_565, const SkJumper_MemoryCtx* ctx) {

 // ~~~~~~ Compound stages ~~~~~~ //

 STAGE(srcover_rgba_8888, const SkJumper_MemoryCtx* ctx) {
-    auto ptr = ptr_at_xy<uint32_t>(ctx, x,y);
+    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

     load_8888(ptr, tail, &dr,&dg,&db,&da);
     r = r + div255( dr*inv(a) );
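Putting the lowp pieces together, srcover_rgba_8888 above computes out = src + dst*(255 - src_a)/255 per channel entirely in integer lanes. A scalar sketch using the same helper names as the diff (div255 repeats the earlier sketch; inv(a) = 255 - a is inferred from usage):

```cpp
#include <cstdint>

static uint16_t div255(uint32_t v) {               // as sketched earlier
    uint32_t x = v + 128;
    return (uint16_t)((x + (x >> 8)) >> 8);
}
static uint16_t inv(uint16_t a) { return (uint16_t)(255 - a); }

// src-over in 0..255 fixed point, per channel, premultiplied color:
// out = src + dst * (1 - src_alpha)
static uint16_t srcover(uint16_t src, uint16_t dst, uint16_t src_a) {
    return (uint16_t)(src + div255((uint32_t)dst * inv(src_a)));
}
```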