1 files changed, 2165 insertions, 2165 deletions
diff --git a/src/compute/skc/platforms/cl_12/kernels/render.cl b/src/compute/skc/platforms/cl_12/kernels/render.cl
index 9205334940..a7b32299c9 100644
--- a/src/compute/skc/platforms/cl_12/kernels/render.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/render.cl
@@ -1,2165 +1,2165 @@
-/*
- * Copyright 2016 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "tile.h"
-#include "block.h"
-#include "styling_types.h"
-#include "atomic_cl.h"
-#include "device_cl_12.h"
-
-//
-//
-//
-
-#define SKC_RENDER_SUBGROUP_MASK  (SKC_RENDER_SUBGROUP_SIZE - 1)
-
-//
-//
-//
-
-#if   ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 )
-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_1()
-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      0
-
-#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 2 )
-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_2()
-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      1
-
-#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 4 )
-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_4()
-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      3
-
-#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 8 )
-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_8()
-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      7
-
-#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 16)
-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_16()
-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      15
-#endif
-
-//
-// tile state flag bits
-//
-
-typedef enum skc_tile_flags_e {
-
-  // FLUSH
-  SKC_TILE_FLAGS_FLUSH_FINALIZE    = 0x00000001,
-  SKC_TILE_FLAGS_FLUSH_UNWIND      = 0x00000002,
-  SKC_TILE_FLAGS_FLUSH_COMPLETE    = 0x00000004,
-
-  // OPACITY
-  SKC_TILE_FLAGS_SCATTER_SKIP      = 0x00000008,
-
-  //
-  // Note: testing for opacity and skipping scattering is on its way
-  // to becoming a much more programmable option because sometimes we
-  // may be compositing/blending from back-to-front and/or be using
-  // group blend rules that ignore opacity.
-  //
-  // The point is that all of these decisions should be encoded in
-  // styling commands and, as much as possible, removed from the final
-  // group/layer styling traversal render loop.
-  //
-
-} skc_tile_flags_e;
-
-//
-// COVER -- assumes availability of either fp16 or fp32
-//
-
-union skc_tile_cover
-{
-  struct {
-    SKC_RENDER_TILE_COVER             c[SKC_TILE_WIDTH];
-  } aN;
-
-#ifdef SKC_RENDER_TILE_COVER_VECTOR
-  struct {
-    SKC_RENDER_TILE_COVER_VECTOR      c[SKC_RENDER_TILE_COVER_VECTOR_COUNT];
-  } vN;
-#endif
-};
-
-//
-// COLOR -- assumes availability of either fp16 or fp32
-//
-
-union skc_tile_color
-{
-  union {
-    struct {
-      SKC_RENDER_TILE_COLOR           r;
-      SKC_RENDER_TILE_COLOR           g;
-      SKC_RENDER_TILE_COLOR           b;
-      SKC_RENDER_TILE_COLOR           a;
-    } rgba[SKC_TILE_WIDTH];
-  } aN;
-
-#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
-  union {
-    SKC_RENDER_TILE_COLOR_INTERLEAVED rgba[SKC_TILE_WIDTH];
-  } iN;
-#endif
-
-#ifdef SKC_RENDER_TILE_COLOR_VECTOR
-  union {
-    SKC_RENDER_TILE_COLOR_VECTOR      rgba[SKC_RENDER_TILE_COLOR_VECTOR_COUNT];
-  } vN;
-#endif
-
-  struct {
-    union {
-      struct {
-        SKC_RENDER_TILE_COLOR         r;
-        SKC_RENDER_TILE_COLOR         g;
-      };
-      SKC_RENDER_GRADIENT_FLOAT       distance;
-    };
-    union {
-      struct {
-        SKC_RENDER_TILE_COLOR         b;
-        SKC_RENDER_TILE_COLOR         a;
-      };
-      SKC_RENDER_GRADIENT_FLOAT       stoplerp;
-    };
-  } grad[SKC_TILE_WIDTH];
-};
-
-//
-// SHARED MEMORY STATE
-//
-
-#define SKC_RENDER_TILE_SMEM_WORDS ((SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT)
-
-#define SKC_RENDER_WIDE_AA_BYTES   (SKC_RENDER_TILE_SMEM_WORDS * sizeof(int) / SKC_RENDER_SUBGROUP_SIZE)
-#define SKC_RENDER_WIDE_AA_WIDTH   (SKC_RENDER_WIDE_AA_BYTES / sizeof(SKC_RENDER_WIDE_AA))
-
-//
-//
-//
-
-union skc_subgroup_smem
-{
-  //
-  // The tiles are stored in column-major / height-major order
-  //
-  // The final column is a guard column that is OK to write to but
-  // will never be read.  It simplifies the TTSB scatter but could be
-  // predicated if SMEM is really at a premium.
-  //
-#if ( SKC_RENDER_SUBGROUP_SIZE > 1 )
-  struct {
-    SKC_ATOMIC_UINT              area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h]
-  } atomic;
-#endif
-
-  struct {
-    int                          area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h]
-  } aN;
-
-  struct { // assumption is that height = subgroup
-    SKC_RENDER_AREA_V            area[SKC_TILE_WIDTH + 1][SKC_RENDER_SUBGROUP_SIZE];
-  } vN;
-
-  struct { // assumption is that height = subgroup
-    SKC_RENDER_WIDE_AA           area[SKC_RENDER_WIDE_AA_WIDTH][SKC_RENDER_SUBGROUP_SIZE];
-  } wide;
-
-  union skc_styling_cmd          cmds[(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT];
-
-  half                           gc  [(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT * 2];
-
-#if 0
-  //
-  // SPILL TO GMEM
-  //
-#if (SKC_REGS_COLOR_S > 0) || (SKC_REGS_COVER_S > 0)
-  struct {
-
-#if (SKC_REGS_COLOR_S > 0)
-    union skc_color_r            color[SKC_REGS_COLOR_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH];
-#endif
-
-#if (SKC_REGS_COVER_S > 0)
-    union float                  cover[SKC_REGS_COVER_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH];
-#endif
-
-  } regs;
-#endif
-  //
-  //
-  //
-#endif
-};
-
-//
-//
-//
-
-#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
-
-#define skc_subgroup_lane()  0
-
-#else
-
-#define skc_subgroup_lane()  get_sub_group_local_id()
-
-#endif
-
-//
-//
-//
-
-typedef skc_uint  skc_ttsk_lo_t;
-typedef skc_uint  skc_ttsk_hi_t;
-
-typedef skc_uint  skc_ttpk_lo_t;
-typedef skc_uint  skc_ttpk_hi_t;
-
-typedef skc_uint  skc_ttxk_lo_t;
-typedef skc_uint  skc_ttxk_hi_t;
-
-typedef skc_uint  skc_ttck_lo_t;
-typedef skc_uint  skc_ttck_hi_t;
-
-typedef skc_uint2 skc_ttck_t;
-
-typedef skc_int   skc_ttxb_t;
-
-//
-// TTCK (32-BIT COMPARE) v1:
-//
-//  0                                                           63
-//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
-//  +----------------------+--------+--------+-------+-----+-----+
-//  |          30          |    1   |    1   |   18  |  7  |  7  |
-//
-//
-// TTCK (32-BIT COMPARE) v2:
-//
-//  0                                                           63
-//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
-//  +----------------------+--------+--------+-------+-----+-----+
-//  |          30          |    1   |    1   |   15  |  9  |  8  |
-//
-//
-// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
-//
-//  0                                                           63
-//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
-//  +----------------------+--------+--------+-------+-----+-----+
-//  |          27          |    1   |    1   |   18  |  9  |  8  |
-//
-
-static
-skc_uint
-skc_ttck_lo_get_ttxb_id(skc_ttck_lo_t const a)
-{
-  return a & SKC_TTCK_LO_MASK_ID;
-}
-
-static
-skc_layer_id
-skc_ttck_get_layer(skc_ttck_t const a)
-{
-  //
-  // FIXME -- a union with a ulong and a shift down and mask is
-  // probably faster on some architectures
-  //
-  skc_uint const lo = (a.lo >> SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE);
-  skc_uint const hi = (a.hi  & SKC_TTCK_HI_MASK_LAYER) << SKC_TTCK_LO_BITS_LAYER;
-
-  return lo | hi;
-}
-
-static
-skc_uint
-skc_ttck_hi_get_x(skc_ttck_hi_t const a)
-{
-  return SKC_BFE(a,SKC_TTCK_HI_BITS_X,SKC_TTCK_HI_OFFSET_X);
-}
-
-static
-skc_uint
-skc_ttck_hi_get_y(skc_ttck_hi_t const a)
-{
-  return a >> SKC_TTCK_HI_OFFSET_Y;
-}
-
-static
-skc_bool
-skc_ttck_equal_yxl(skc_ttck_t const a, skc_ttck_t const b)
-{
-  skc_uint const lo = (a.lo ^ b.lo) & SKC_BITS_TO_MASK_AT(SKC_TTCK_LO_BITS_LAYER,SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE);
-  skc_uint const hi = (a.hi ^ b.hi);
-
-  return (lo | hi) == 0;
-}
-
-static
-skc_bool
-skc_ttck_hi_equal_yx(skc_ttck_hi_t const a, skc_ttck_hi_t const b)
-{
-  return ((a ^ b) & SKC_TTCK_HI_MASK_YX) == 0;
-}
-
-static
-skc_bool
-skc_ttck_lo_is_prefix(skc_ttck_lo_t const a)
-{
-  return (a & SKC_TTCK_LO_MASK_PREFIX) != 0;
-}
-
-//
-// TILE TRACE SUBPIXEL
-//
-// The subpixels are encoded with either absolute tile coordinates
-// (32-bits) or packed in delta-encoded form form.
-//
-// For 32-bit subpixel packing of a 32x32 tile:
-//
-// A tile X is encoded as:
-//
-//   TX : 10 : unsigned min(x0,x1) tile subpixel coordinate.
-//
-//   SX :  6 : unsigned subpixel span from min to max x with range
-//             [0,32]. The original direction is not captured. Would
-//             be nice to capture dx but not necessary right now but
-//             could be in the future. <--- SPARE VALUES AVAILABLE
-//
-// A tile Y is encoded as:
-//
-//   TY : 10 : unsigned min(y0,y1) tile subpixel coordinate.
-//
-//   DY :  6 : signed subpixel delta y1-y0. The range of delta is
-//             [-32,32] but horizontal lines are not encoded so [1,32]
-//             is mapped to [0,31]. The resulting range [-32,31] fits
-//             in 6 bits.
-//
-// TTS:
-//
-//  0                        31
-//  |  TX |  SX  |  TY |  DY  |
-//  +-----+------+-----+------+
-//  |  10 |   6  |  10 |   6  |
-//
-
-static
-SKC_RENDER_TTS_V_BITFIELD
-skc_tts_get_ty_pixel_v(SKC_RENDER_TTS_V const a)
-{
-  //
-  // extract the whole pixel y coordinate
-  //
-  return SKC_BFE(a,
-                 SKC_TTS_BITS_TY   - SKC_SUBPIXEL_RESL_Y_LOG2,
-                 SKC_TTS_OFFSET_TY + SKC_SUBPIXEL_RESL_Y_LOG2);
-}
-
-static
-SKC_RENDER_TTS_V_BITFIELD
-skc_tts_get_xy_idx_v(SKC_RENDER_TTS_V const a)
-{
-  //
-  // get the linear array tile index of the pixel
-  //
-  return (((a & SKC_TTS_MASK_TX_PIXEL)
-
-#if   (SKC_SUBPIXEL_RESL_X_LOG2 > SKC_TILE_HEIGHT_LOG2)
-           >> (SKC_SUBPIXEL_RESL_X_LOG2 - SKC_TILE_HEIGHT_LOG2)
-#elif (SKC_SUBPIXEL_RESL_X_LOG2 < SKC_TILE_HEIGHT_LOG2)
-           << (SKC_TILE_HEIGHT_LOG2     - SKC_SUBPIXEL_RESL_X_LOG2)
-#endif
-
-           ) | skc_tts_get_ty_pixel_v(a));
-}
-
-#if 0
-static
-skc_ttx_v_s32_t
-skc_tts_get_dy_v(SKC_RENDER_TTS_V const a)
-{
-  skc_ttx_v_s32_t const dy = SKC_AS(skc_ttx_v_s32_t)a >> SKC_TTS_OFFSET_DY;
-
-  return (dy + SKC_AS(skc_ttx_v_s32_t)(~a >> 31));
-}
-#else
-static
-SKC_RENDER_TTS_V_BITFIELD
-skc_tts_get_dy_v(SKC_RENDER_TTS_V const a)
-{
-  SKC_RENDER_TTS_V_BITFIELD const dy = a >> SKC_TTS_OFFSET_DY;
-
-  return dy - (~a >> 31);
-}
-#endif
-
-static
-SKC_RENDER_TTS_V_BITFIELD
-skc_tts_get_tx_subpixel_v(SKC_RENDER_TTS_V const a)
-{
-  return a & SKC_BITS_TO_MASK(SKC_SUBPIXEL_RESL_X_LOG2);
-}
-
-static
-SKC_RENDER_TTS_V_BITFIELD
-skc_tts_get_sx_v(SKC_RENDER_TTS_V const a)
-{
-  return SKC_BFE(a,SKC_TTS_BITS_SX,SKC_TTS_OFFSET_SX);
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_aa_zero(__local union skc_subgroup_smem * SKC_RESTRICT const smem)
-{
-  //
-  // SIMD / CPU
-  //
-  //      &
-  //
-  // SIMT / GPU
-  //
-  // Note that atomic_init() is likely implemented as a simple
-  // assignment so there is no identifiable performance difference on
-  // current targets.
-  //
-  // If such an architecture appears in the future then we'll probably
-  // still want to implement this zero'ing operation as below but
-  // follow with an appropriate fence that occurs before any scatter
-  // operations.
-  //
-  // The baroque expansion below improves performance on Intel GEN by,
-  // presumably, achieving the 64-byte per clock SLM write as well as
-  // minimizing the overall number of SEND() block initializations and
-  // launches.
-  //
-  // Intel GENx has a documented 64 byte per cycle SLM write limit.
-  // So having each lane in an 8 lane subgroup zero-write 8 bytes is
-  // probably a safe bet (Later: benchmarking backs this up!).
-  //
-  // Note there is no reason at this time to unroll this loop.
-  //
-  for (uint ii=0; ii<SKC_RENDER_WIDE_AA_WIDTH; ii++)
-    smem->wide.area[ii][skc_subgroup_lane()] = ( 0 );
-}
-
-//
-// Note this is going to be vectorizable on most architectures.
-//
-// The return of the key translation feature might complicate things.
-//
-
-static
-void
-skc_scatter_ttpb(__global skc_ttxb_t        const * SKC_RESTRICT const ttxb_extent,
-                 __local  union skc_subgroup_smem * SKC_RESTRICT const smem,
-                 skc_block_id_t                                  const pb_id)
-{
-  skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane();
-
-#if   ( SKC_TILE_RATIO == 1 )
-
-  SKC_RENDER_TTP_V const ttp_v = ttxb_extent[offset];
-
-#elif ( SKC_TILE_RATIO == 2 )
-
-  SKC_RENDER_TTP_V const ttp_v = vload2(offset,ttxb_extent);
-
-#else
-
-#error("tile ratio greater than 2 not supported")
-
-#endif
-
-  //
-  // Note there is no need to use an atomic for this operation on the
-  // current group of target platforms... but this may change if
-  // atomic ops truly go through a different path.
-  //
-  // As noted above, this direct increment is probably faster and can
-  // always be followed by a fence.
-  //
-  // Furthermore, note that the key sorting orders all ttck keys
-  // before ttpk keys.
-  //
-
-  //
-  // FIXME -- if the SMEM store is wider than bank word count then we
-  // might want to odd-even interleave the TTP values if the target
-  // device can't handle 64-bit stores
-  //
-
-  //
-  // skipping per-key translation for now
-  //
-  smem->vN.area[0][skc_subgroup_lane()] += ttp_v << (SKC_SUBPIXEL_RESL_X_LOG2 + 1);
-}
-
-//
-// Note that skc_scatter_ttsb is *not* vectorizable unless the
-// architecture supports a "scatter-add" capability.  All relevant
-// GPUs support atomic add on shared/local memory and thus support
-// scatter-add.
-//
-
-static
-void
-skc_scatter_ttsb(__global skc_ttxb_t        const * SKC_RESTRICT const ttxb_extent,
-                 __local  union skc_subgroup_smem * SKC_RESTRICT const smem,
-                 skc_block_id_t                                  const sb_id)
-{
-  skc_uint         const offset = sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
-
-  SKC_RENDER_TTS_V const tts_v  = ttxb_extent[offset];
-
-  //
-  // Skipping per-key translation for now
-  //
-
-  // Index into tile
-  //
-  // The tiles are stored in column-major / height-major order
-  //
-  // The final column is a guard column that is OK to write to but
-  // will never be read.  It simplifies the TTSB scatter but could be
-  // predicated if SMEM is really at a premium.
-  //
-
-  SKC_RENDER_TTS_V_BITFIELD const xy_idx = skc_tts_get_xy_idx_v(tts_v);
-
-#if 0
-  if (tts_v != SKC_TTS_INVALID)
-    printf("(%08X) = %u\n",tts_v,xy_idx);
-#endif
-
-  //
-  // adjust subpixel range to max y
-  //
-  // range is stored as [-32,31] and when read [0,31] is mapped to
-  // [1,32] because a dy of 0 is not possible.
-  //
-  // more succinctly: if dy >= 0 then ++dy
-  //
-  SKC_RENDER_TTS_V_BITFIELD const dy     = skc_tts_get_dy_v(tts_v);
-
-  //
-  // FIXME -- benchmark performance of setting dy to 0 if ttsv.vN is invalid?
-  //
-
-  // this "min(x0) * 2 + dx" is equivalent to "x0 + x1"
-  SKC_RENDER_TTS_V_BITFIELD const widths = skc_tts_get_tx_subpixel_v(tts_v) * 2 + skc_tts_get_sx_v(tts_v);
-
-  // Calculate left and right coverage contribution trapezoids
-  SKC_RENDER_TTS_V_BITFIELD const left   = dy * widths;
-  SKC_RENDER_TTS_V_BITFIELD const right  = (dy << (SKC_SUBPIXEL_RESL_X_LOG2 + 1)) - left;
-
-  //
-  // Accumulate altitudes and areas
-  //
-  // Optimization: if the device supports an CPU/SIMD vector-add or
-  // GPU/SIMT scatter-add atomic int2 add operation then placing the
-  // ALT and AREA values side-by-side would halve the number of
-  // additions.
-  //
-#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
-  //
-  // CPU/SIMD
-  //
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A)                                 \
-  if (tts_v C != SKC_TTS_INVALID) {                             \
-    smem->aN.area[SKC_TILE_HEIGHT + xy_idx C] += left  C;       \
-    smem->aN.area[                  xy_idx C] += right C;       \
-  }
-
-#else
-  //
-  // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD
-  //
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A)                                         \
-  if (tts_v C != SKC_TTS_INVALID) {                                     \
-    SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area +           \
-                                          SKC_TILE_HEIGHT   + xy_idx C, \
-                                          left C);                      \
-    SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + xy_idx C, \
-                                          right C);                     \
-  }
-#endif
-
-  SKC_RENDER_TTSB_EXPAND();
-}
-
-//
-// Note that 2048.0 can be represented exactly with fp16... fortuitous!
-//
-
-#define SKC_RENDER_FILL_MAX_AREA          (2u * SKC_SUBPIXEL_RESL_X * SKC_SUBPIXEL_RESL_Y)
-#define SKC_RENDER_FILL_MAX_AREA_2        (2u * SKC_RENDER_FILL_MAX_AREA)
-#define SKC_RENDER_FILL_EVEN_ODD_MASK     (SKC_RENDER_FILL_MAX_AREA_2 - 1)
-#define SKC_RENDER_FILL_MAX_AREA_RCP_F32  (SKC_RENDER_TILE_COVER)(1.0f / SKC_RENDER_FILL_MAX_AREA)
-
-//
-//
-//
-
-static
-void
-skc_tile_cover_nonzero(__local union skc_subgroup_smem * SKC_RESTRICT const smem,
-                       union skc_tile_cover            * SKC_RESTRICT const cover,
-                       union skc_tile_color            * SKC_RESTRICT const color)
-{
-  SKC_RENDER_ACC_COVER_INT area = 0;
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    {
-      area                                   += smem->vN.area[ii][skc_subgroup_lane()];
-      SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area);
-      SKC_RENDER_TILE_COVER     const nonzero = SKC_CONVERT(SKC_RENDER_TILE_COVER)(min(trapabs,SKC_RENDER_FILL_MAX_AREA));
-
-      cover->aN.c[ii] = nonzero * (SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA_RCP_F32);
-    }
-}
-
-static
-void
-skc_tile_cover_evenodd(__local union skc_subgroup_smem * SKC_RESTRICT const smem,
-                       union skc_tile_cover            * SKC_RESTRICT const cover,
-                       union skc_tile_color            * SKC_RESTRICT const color)
-{
-  SKC_RENDER_ACC_COVER_INT area = 0;
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    {
-      area                                   += smem->vN.area[ii][skc_subgroup_lane()];
-      SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area);
-      SKC_RENDER_ACC_COVER_UINT const reflect = abs(SKC_AS(SKC_RENDER_ACC_COVER_INT)((trapabs & SKC_RENDER_FILL_EVEN_ODD_MASK) - SKC_RENDER_FILL_MAX_AREA));
-
-      cover->aN.c[ii] = SKC_CONVERT(SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA - reflect) * (SKC_RENDER_TILE_COVER)SKC_RENDER_FILL_MAX_AREA_RCP_F32;
-    }
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_color_fill_solid(__global union skc_styling_cmd const * SKC_RESTRICT const commands,
-                          uint                                 * SKC_RESTRICT const cmd_next,
-                          union skc_tile_color                 * SKC_RESTRICT const color)
-{
-  //
-  // rgba = solid fill
-  //
-  __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0;
-
-  *cmd_next += 2;
-
-#if !defined( SKC_RENDER_TILE_COLOR_VECTOR )
-
-  SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    color->aN.rgba[ii].r = rg.lo;
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    color->aN.rgba[ii].g = rg.hi;
-
-  SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    color->aN.rgba[ii].b = ba.lo;
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    color->aN.rgba[ii].a = ba.hi;
-
-#else
-
-  SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);
-  SKC_RENDER_TILE_COLOR      const r  = rg.lo;
-
-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
-    color->vN.rgba[ii].even.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(r);
-
-  SKC_RENDER_TILE_COLOR      const g  = rg.hi;
-
-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
-    color->vN.rgba[ii].odd.even  = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(g);
-
-  SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);
-  SKC_RENDER_TILE_COLOR      const b  = ba.lo;
-
-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
-    color->vN.rgba[ii].even.odd  = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(b);
-
-  SKC_RENDER_TILE_COLOR      const a  = ba.hi;
-
-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
-    color->vN.rgba[ii].odd.odd   = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(a);
-
-#endif
-}
-
-//
-// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"
-//
-// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/
-//
-// Lerp in two fma/mad ops:
-//
-//    t * b + ((-t) * a + a)
-//
-// Note: OpenCL documents mix() as being implemented as:
-//
-//    a + (b - a) * t
-//
-// But this may be a native instruction on some devices.  For example,
-// on GEN9 there is an LRP "linear interoplation" function but it
-// doesn't appear to support half floats.
-//
-
-#if 1
-#define SKC_LERP(a,b,t)  mad(t,b,mad(-(t),a,a))
-#else
-#define SKC_LERP(a,b,t)  mix(a,b,t)
-#endif
-
-//
-// CPUs have a mock local address space so copying the gradient header
-// is probably not useful.  Just read directly from global.
-//
-
-#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL
-#define SKC_RENDER_GRADIENT_SPACE  __local
-#else
-#define SKC_RENDER_GRADIENT_SPACE  __global
-#endif
-
-//
-// gradient is non-vertical
-//
-// removed the vertical (actually, horizontal) special case
-//
-
-static
-void
-skc_tile_color_fill_gradient_linear_nonvertical(__local  union skc_subgroup_smem     * SKC_RESTRICT const smem,
-                                                __global union skc_styling_cmd const * SKC_RESTRICT const commands,
-                                                uint                                 * SKC_RESTRICT const cmd_next,
-                                                union skc_tile_color                 * SKC_RESTRICT const color,
-                                                skc_ttck_hi_t                                       const ttck_hi)
-{
-  //
-  // Where is this tile?
-  //
-  // Note that the gradient is being sampled from pixel centers.
-  //
-  SKC_RENDER_GRADIENT_FLOAT const y =
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) I##.5f P
-    (SKC_RENDER_GRADIENT_FLOAT)( SKC_RENDER_SCANLINE_VECTOR_EXPAND() ) +
-    (skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE));
-
-  float                     const x = 0.5f + (skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH);
-
-  //
-  // Get starting numerator and denominator
-  //
-  // Note: if gh[0].dx is exactly 0.0f then this is a vertical
-  // gradient and can be handled by a special opcode.
-  //
-  // Note: the mad() ordering is slightly different than the original
-  // CUDA implementation.
-  //
-  union skc_gradient_vector const gv       = { vload4(0,&commands[*cmd_next].f32) };
-
-  *cmd_next += 4;
-
-  float                     const gv_x_dot = mad(x,gv.dx,gv.p0);
-  SKC_RENDER_GRADIENT_FLOAT const gv_numer = mad(y,gv.dy,gv_x_dot);
-
-  //
-  // Where are columns along gradient vector?
-  //
-  // TODO: Note that the gv_denom isn't multiplied through.
-  //
-  // Please doublecheck this... but I recall that in certain cases
-  // this wipes out some precision and results in minor but noticeable
-  // gradient artifacts.
-  //
-  // All arguments are scalars except gv_numer so a simpler
-  // evaluation might save some flops.
-  //
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    color->grad[ii].distance = mad(gv.dx,(float)ii,gv_numer) * gv.denom;
-
-  //
-  // is gradient non-repeating, repeating or reflecting?
-  //
-  switch (commands[(*cmd_next)++].u32)
-    {
-    case SKC_STYLING_GRADIENT_TYPE_LINEAR_NON_REPEATING:
-      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-        color->grad[ii].distance = clamp(color->grad[ii].distance,0.0f,1.0f);
-      break;
-
-    case SKC_STYLING_GRADIENT_TYPE_LINEAR_REPEATING:
-      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-        color->grad[ii].distance -= floor(color->grad[ii].distance);
-      break;
-
-    default: // PXL_STYLING_GRADIENT_TYPE_LINEAR_REFLECTING
-      //
-      // OPTIMIZATION: Can this be done in fewer than ~4 ops?
-      //
-      // Note: OpenCL "rint()" is round-to-nearest-even integer!
-      //
-      // Note: the floor() "round to -inf" op is implemented in the
-      // GEN op 'FRC' so probably don't use trunc() when floor will
-      // suffice.
-      //
-
-      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-        {
-          SKC_RENDER_GRADIENT_FLOAT dist_abs = fabs(color->grad[ii].distance);
-          color->grad[ii].distance = fabs(dist_abs - rint(dist_abs));
-        }
-    }
-
-  //
-  // initialize "stoplerp" for all columns
-  //
-  uint const slope_count = commands[(*cmd_next)++].u32;
-  uint const gd_n_v1     = commands[(*cmd_next)++].u32; // REMOVE ME
-
-  {
-    float const slope = commands[(*cmd_next)++].f32;
-
-    // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-    for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-      color->grad[ii].stoplerp = color->grad[ii].distance * slope;
-  }
-
-  //
-  // compute stoplerp for remaining stops
-  //
-  for (int jj=1; jj<slope_count; jj++)
-    {
-      float const floor = (float)jj;
-      float const slope = commands[(*cmd_next)++].f32;
-
-      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-        color->grad[ii].stoplerp = mad(min(0, color->grad[ii].stoplerp - floor),slope,color->grad[ii].stoplerp);
-    }
-
-  //
-  // copy gradient colors to local memory
-  //
-  uint const gd_n = slope_count + 1;
-
-#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL
-  //
-  // copy entire gradient descriptor to local memory
-  //
-  for (uint ii=skc_subgroup_lane(); ii<gd_n*4; ii+=SKC_RENDER_SUBGROUP_SIZE)
-    smem->cmds[ii].u32 = commands[*cmd_next + ii].u32;
-
-  __local  half const * const SKC_RESTRICT gc = smem->gc + 0;
-#else
-  //
-  // prefetch entire gradient header
-  //
-  // no noticeable impact on performance
-  //
-  // prefetch(&commands[*cmd_next].u32,gh_words);
-  //
-  __global half const * const SKC_RESTRICT gc = commands[*cmd_next].f16a2 + 0;
-#endif
-
-  //
-  // adjust cmd_next so that V1 structure is consumed -- FIXME
-  //
-  *cmd_next += SKC_GRADIENT_CMD_WORDS_V2_ADJUST(gd_n_v1,gd_n);
-
-  //
-  // lerp between color pair stops
-  //
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    {
-      //
-      // Finally, we have the gradient stop index and the color stop
-      // pair lerp fraction
-      //
-      // Note that if these are vector values then a gather operation
-      // must occur -- there may be platforms (AVX-512?) that can
-      // perform an explicit gather on a vector type but it's not
-      // really expressible in OpenCL except implicitly with a
-      // workgroup of work items.
-      //
-      // ***********************
-      //
-      // FIXME -- USE HERB'S SINGLE FMA LERP
-      //
-      // ***********************
-      //
-      SKC_RENDER_GRADIENT_STOP const gc_stop = SKC_CONVERT(SKC_RENDER_GRADIENT_STOP)(color->grad[ii].stoplerp);
-      SKC_RENDER_GRADIENT_FRAC const gc_frac = SKC_CONVERT(SKC_RENDER_GRADIENT_FRAC)(color->grad[ii].stoplerp - floor(color->grad[ii].stoplerp));
-
-      {
-        SKC_RENDER_TILE_COLOR lo, hi;
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
-          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + 0,gc); \
-          lo C                                = cc.lo;                  \
-          hi C                                = cc.hi;                  \
-        }
-
-        SKC_RENDER_SCANLINE_VECTOR_EXPAND();
-
-        color->aN.rgba[ii].r = SKC_LERP(lo,hi,gc_frac);
-      }
-
-      //
-      //
-      //
-      {
-        SKC_RENDER_TILE_COLOR lo, hi;
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
-          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n,gc); \
-          lo C                                = cc.lo;                  \
-          hi C                                = cc.hi;                  \
-        }
-
-        SKC_RENDER_SCANLINE_VECTOR_EXPAND();
-
-        color->aN.rgba[ii].g = SKC_LERP(lo,hi,gc_frac);
-      }
-
-      //
-      //
-      //
-      {
-        SKC_RENDER_TILE_COLOR lo, hi;
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
-          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*2,gc); \
-          lo C                                = cc.lo;                  \
-          hi C                                = cc.hi;                  \
-        }
-
-        SKC_RENDER_SCANLINE_VECTOR_EXPAND();
-
-        color->aN.rgba[ii].b = SKC_LERP(lo,hi,gc_frac);
-      }
-
-      //
-      //
-      //
-      {
-        SKC_RENDER_TILE_COLOR lo, hi;
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
-          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*3,gc); \
-          lo C                                = cc.lo;                  \
-          hi C                                = cc.hi;                  \
-        }
-
-        SKC_RENDER_SCANLINE_VECTOR_EXPAND();
-
-        color->aN.rgba[ii].a = SKC_LERP(lo,hi,gc_frac);
-      }
-    }
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_blend_over(union skc_tile_color       * SKC_RESTRICT const color_acc,
-                    union skc_tile_cover const * SKC_RESTRICT const cover_wip,
-                    union skc_tile_color const * SKC_RESTRICT const color_wip)
-{
-  //
-  // fralunco = cover.wip * acc.a
-  //
-  // acc.r    =  fralunco * wip.r + acc.r
-  // acc.g    =  fralunco * wip.g + acc.g
-  // acc.b    =  fralunco * wip.b + acc.b
-  // acc.a    = -fralunco * wip.a + acc.a
-  //
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    {
-      SKC_RENDER_TILE_COVER const fralunco = cover_wip->aN.c[ii] * color_acc->aN.rgba[ii].a;
-
-      color_acc->aN.rgba[ii].r = mad(+fralunco,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);
-      color_acc->aN.rgba[ii].g = mad(+fralunco,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);
-      color_acc->aN.rgba[ii].b = mad(+fralunco,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);
-      color_acc->aN.rgba[ii].a = mad(-fralunco,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);
-    }
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_blend_plus(union skc_tile_color       * SKC_RESTRICT const color_acc,
-                    union skc_tile_cover const * SKC_RESTRICT const cover_wip,
-                    union skc_tile_color const * SKC_RESTRICT const color_wip)
-{
-  //
-  // cover_min = min(cover.wip,a.acc)
-  //
-  // r.acc =  cover_min * r.wip + r.acc
-  // g.acc =  cover_min * g.wip + g.acc
-  // b.acc =  cover_min * b.wip + b.acc
-  // a.acc = -cover_min * a.wip + a.acc
-  //
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    {
-      SKC_RENDER_TILE_COVER const cover_min = fmin(cover_wip->aN.c[ii],color_acc->aN.rgba[ii].a);
-
-      color_acc->aN.rgba[ii].r = mad(+cover_min,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);
-      color_acc->aN.rgba[ii].g = mad(+cover_min,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);
-      color_acc->aN.rgba[ii].b = mad(+cover_min,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);
-      color_acc->aN.rgba[ii].a = mad(-cover_min,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);
-    }
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_blend_multiply(union skc_tile_color       * SKC_RESTRICT const color_acc,
-                        union skc_tile_cover const * SKC_RESTRICT const cover_wip,
-                        union skc_tile_color const * SKC_RESTRICT const color_wip)
-{
-  //
-  // r.acc = (cover.wip * r.wip) * r.acc
-  // g.acc = (cover.wip * g.wip) * g.acc
-  // b.acc = (cover.wip * b.wip) * b.acc
-  // a.acc = (cover.wip * a.wip) * (1.0 - a.acc) <-- a.acc is already (1.0 - alpha)
-  //
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    {
-      color_acc->aN.rgba[ii].r *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].r;
-      color_acc->aN.rgba[ii].g *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].g;
-      color_acc->aN.rgba[ii].b *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].b;
-      color_acc->aN.rgba[ii].a *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].a;
-    }
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_blend_knockout(union skc_tile_cover       * SKC_RESTRICT const cover_acc,
-                        union skc_tile_color       * SKC_RESTRICT const color_acc,
-                        union skc_tile_cover const * SKC_RESTRICT const cover_wip,
-                        union skc_tile_color const * SKC_RESTRICT const color_wip)
-{
-  //
-  // cover.wip.contrib = (1.0 - cover.acc) * cover.wip
-  // cover.acc         = cover.acc + cover.wip.contrib
-  //
-  // r.acc =  cover.wip.contrib * r.wip + r.acc
-  // g.acc =  cover.wip.contrib * g.wip + g.acc
-  // b.acc =  cover.wip.contrib * b.wip + b.acc
-  // a.acc = -cover.wip.contrib * a.wip * a.acc
-  //
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    {
-      SKC_RENDER_TILE_COVER const contrib = (1 - cover_acc->aN.c[ii]) * cover_wip->aN.c[ii];
-
-      cover_acc->aN.c[ii]     += contrib;
-
-      color_acc->aN.rgba[ii].r = mad(+contrib,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);
-      color_acc->aN.rgba[ii].g = mad(+contrib,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);
-      color_acc->aN.rgba[ii].b = mad(+contrib,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);
-      color_acc->aN.rgba[ii].a = mad(-contrib,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);
-    }
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_cover_msk_copy_wip(union skc_tile_cover       * SKC_RESTRICT const cover_msk,
-                            union skc_tile_cover const * SKC_RESTRICT const cover_wip)
-{
-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    cover_msk->aN.c[ii] = cover_wip->aN.c[ii];
-
-#else
-
-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
-  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
-    cover_msk->vN.c[ii] = cover_wip->vN.c[ii];
-
-#endif
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_cover_msk_copy_acc(union skc_tile_cover       * SKC_RESTRICT const cover_msk,
-                            union skc_tile_cover const * SKC_RESTRICT const cover_acc)
-{
-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    cover_msk->aN.c[ii] = cover_acc->aN.c[ii];
-
-#else
-
-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNTN)))
-  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
-    cover_msk->vN.c[ii] = cover_acc->vN.c[ii];
-
-#endif
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_cover_accumulate(union skc_tile_cover       * SKC_RESTRICT const cover_acc,
-                          union skc_tile_cover const * SKC_RESTRICT const cover_wip)
-{
-  //
-  // cover.wip.contrib = (1.0 - cover.acc) * cover.wip
-  // cover.acc         = cover.acc + cover.wip.contrib
-  //
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    cover_acc->aN.c[ii] = mad(1 - cover_acc->aN.c[ii],cover_wip->aN.c[ii],cover_acc->aN.c[ii]);
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_cover_wip_mask(union skc_tile_cover       * SKC_RESTRICT const cover_wip,
-                        union skc_tile_cover const * SKC_RESTRICT const cover_msk)
-{
-  //
-  // cover.wip *= cover.msk
-  //
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    cover_wip->aN.c[ii] *= cover_msk->aN.c[ii];
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_cover_wip_zero(union skc_tile_cover * SKC_RESTRICT const cover)
-{
-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 )
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    cover->aN.c[ii] = 0;
-
-#else
-  //
-  // GEN9 compiler underperforms on this
-  //
-
-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
-  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
-    cover->vN.c[ii] = 0;
-
-#endif
-}
-
-static
-void
-skc_tile_cover_acc_zero(union skc_tile_cover * SKC_RESTRICT const cover)
-{
-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 )
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    cover->aN.c[ii] = 0;
-
-#else
-  //
-  // GEN9 compiler underperforms on this
-  //
-
-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
-  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
-    cover->vN.c[ii] = 0;
-
-#endif
-}
-
-static
-void
-skc_tile_cover_msk_zero(union skc_tile_cover * SKC_RESTRICT const cover)
-{
-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    cover->aN.c[ii] = 0;
-
-#else
-  //
-  // GEN9 compiler underperforms on this
-  //
-
-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
-  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
-    cover->vN.c[ii] = 0;
-
-#endif
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_cover_msk_one(union skc_tile_cover * SKC_RESTRICT const cover)
-{
-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    cover->aN.c[ii] = 1;
-
-#else
-  //
-  // GEN9 compiler underperforms on this
-  //
-
-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
-  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
-    cover->vN.c[ii] = SKC_RENDER_TILE_COVER_VECTOR_ONE;
-
-#endif
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_cover_msk_invert(union skc_tile_cover * SKC_RESTRICT const cover)
-{
-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    cover->aN.c[ii] = 1 - cover->aN.c[ii];
-
-#else
-
-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
-  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
-    cover->vN.c[ii] = 1 - cover->vN.c[ii];
-
-#endif
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_color_wip_zero(union skc_tile_color * SKC_RESTRICT const color)
-{
-#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 )
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    {
-      color->aN.rgba[ii].r = 0;
-      color->aN.rgba[ii].g = 0;
-      color->aN.rgba[ii].b = 0;
-      color->aN.rgba[ii].a = 1;
-    }
-
-#else
-  //
-  // DISABLED ON GEN9 -- probably a compiler bug
-  //
-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
-    color->vN.rgba[ii].even.even = 0;
-
-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
-    color->vN.rgba[ii].odd.even  = 0;
-
-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
-    color->vN.rgba[ii].even.odd  = 0;
-
-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
-    color->vN.rgba[ii].odd.odd   = 1;
-#endif
-}
-
-static
-void
-skc_tile_color_acc_zero(union skc_tile_color * SKC_RESTRICT const color)
-{
-#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 )
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    {
-      color->aN.rgba[ii].r = 0;
-      color->aN.rgba[ii].g = 0;
-      color->aN.rgba[ii].b = 0;
-      color->aN.rgba[ii].a = 1;
-    }
-
-#else
-  //
-  // DISABLED ON GEN9 -- probably a compiler bug
-  //
-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
-    color->vN.rgba[ii].even.even = 0;
-
-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
-    color->vN.rgba[ii].odd.even  = 0;
-
-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
-    color->vN.rgba[ii].even.odd  = 0;
-
-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
-    color->vN.rgba[ii].odd.odd   = 1;
-#endif
-}
-
-//
-//
-//
-
-static
-bool
-skc_tile_color_test_opacity(union skc_tile_color const * SKC_RESTRICT const color)
-{
-  //
-  // returns true if tile is opaque
-  //
-  // various hacks to test for complete tile opacity
-  //
-  // note that front-to-back currently has alpha at 0.0f -- this can
-  // be harmonized to use a traditional alpha if we want to support
-  // rendering in either direction
-  //
-  // hack -- ADD/MAX/OR all alphas together and test for non-zero
-  //
-  SKC_RENDER_TILE_COLOR t = color->aN.rgba[0].a;
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
-  for (uint ii=1; ii<SKC_TILE_WIDTH; ii++)
-    t += color->aN.rgba[ii].a;
-
-#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
-  //
-  // SIMD
-  //
-  return !any(t != ( 0 ));
-
-#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 )
-  //
-  // SIMT - scalar per lane
-  //
-  return !sub_group_any(t != 0);
-
-#else
-  //
-  // SIMT - vector per lane
-  //
-  return !sub_group_any(any(t != ( 0 )));
-
-#endif
-
-  //
-  // TODO: The alternative vector-per-lane implementation below is
-  // *not* believed to be performant because the terse vector-wide
-  // test is just hiding a series of comparisons and is likely worse
-  // than the blind ADD/MAX/OR'ing of all alphas followed by a single
-  // test.
-  //
-#if 0
-  //
-  // SIMT - vector per lane
-  //
-
-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT-1)))
-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
-    {
-      if (sub_group_any(any(color->vN.ba[ii].a != ( 0 ))))
-        return false;
-    }
-
-  return true;
-#endif
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_background_over(__global union skc_styling_cmd const * SKC_RESTRICT const commands,
-                         uint                                 * SKC_RESTRICT const cmd_next,
-                         union skc_tile_color                 * SKC_RESTRICT const color)
-{
-  //
-  // acc.r = acc.a * r + acc.r
-  // acc.g = acc.a * g + acc.g
-  // acc.b = acc.a * b + acc.b
-  //
-  __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0;
-
-  *cmd_next += 2;
-
-  SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    color->aN.rgba[ii].r = mad(color->aN.rgba[ii].a,rg.lo,color->aN.rgba[ii].r);
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    color->aN.rgba[ii].g = mad(color->aN.rgba[ii].a,rg.hi,color->aN.rgba[ii].g);
-
-  SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    color->aN.rgba[ii].b = mad(color->aN.rgba[ii].a,ba.lo,color->aN.rgba[ii].b);
-}
-
-//
-//
-//
-
-// #define SKC_SURFACE_IS_BUFFER
-#ifdef  SKC_SURFACE_IS_BUFFER
-
-static
-void
-skc_surface_composite_u8_rgba(__global SKC_RENDER_SURFACE_U8_RGBA * SKC_RESTRICT const surface,
-                              skc_uint                                           const surface_pitch,
-                              union skc_tile_color          const * SKC_RESTRICT const color,
-                              skc_ttck_hi_t                                      const ttck_hi)
-{
-  //
-  // NEW MAJOR OPTIMIZATION:
-  //
-  // Rotating and rasterizing the original world transform by -90
-  // degrees and then rendering the scene scene by +90 degrees enables
-  // all the final surface composite to be perfomed in perfectly
-  // coalesced wide transactions.
-  //
-  // For this reason, linear access to the framebuffer is preferred.
-  //
-  // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv
-  //
-  // NOTE THIS IS TRANSPOSED BY 90 DEGREES
-  //
-  // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE
-  // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING.
-  //
-  // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS
-  // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS
-  //
-  // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL
-  // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER
-  //
-  uint const pitch = surface_pitch / SKC_RENDER_SCANLINE_VECTOR_SIZE;
-  uint const x     = skc_ttck_hi_get_x(ttck_hi);
-  uint const y     = skc_ttck_hi_get_y(ttck_hi) ;
-  uint const base  = x * SKC_TILE_WIDTH * pitch + y * (SKC_TILE_HEIGHT / SKC_RENDER_SCANLINE_VECTOR_SIZE) + skc_subgroup_lane();
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    {
-      SKC_RENDER_SURFACE_U8_RGBA rgba = ( 0xFF000000 );
-
-      rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].r * 255);
-      rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].g * 255) << 8;
-      rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].b * 255) << 16;
-
-      surface[base + ii * pitch] = rgba;
-
-      // printf("%08v2X\n",rgba);
-    }
-}
-
-#else
-
-static
-void
-skc_surface_composite_u8_rgba(__write_only image2d_t                          surface,
-                              union skc_tile_color const * SKC_RESTRICT const color,
-                              skc_ttck_hi_t                                   const ttck_hi)
-{
-  //
-  // NEW MAJOR OPTIMIZATION:
-  //
-  // Rotating and rasterizing the original world transform by -90
-  // degrees and then rendering the scene scene by +90 degrees enables
-  // all the final surface composite to be perfomed in perfectly
-  // coalesced wide transactions.
-  //
-  // For this reason, linear access to the framebuffer is preferred.
-  //
-  // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv
-  //
-  // NOTE THIS IS TRANSPOSED BY 90 DEGREES
-  //
-  // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE
-  // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING.
-  //
-  // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS
-  // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS
-  //
-  // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL
-  // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER
-  //
-
-#if 1
-  int x = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH;
-  int y = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE);
-
-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-    {
-#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) {                       \
-        SKC_RENDER_SURFACE_WRITE(surface,               \
-                                 (int2)(x,y+I),         \
-                                 color->iN.rgba[ii] A); \
-      }
-
-#else
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) {                               \
-        SKC_RENDER_SURFACE_COLOR const rgba =                   \
-          (SKC_RENDER_SURFACE_COLOR)                            \
-          (color->aN.rgba[ii].r C,                              \
-           color->aN.rgba[ii].g C,                              \
-           color->aN.rgba[ii].b C,                              \
-           1.0);                                                \
-        SKC_RENDER_SURFACE_WRITE(surface,(int2)(x,y+I),rgba);   \
-      }
-
-#endif
-
-      SKC_RENDER_SCANLINE_VECTOR_EXPAND();
-
-      x += 1;
-    }
-#else
-    int x = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE);
-    int y = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH;
-
-    // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
-    for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
-      {
-#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) {                       \
-        SKC_RENDER_SURFACE_WRITE(surface,               \
-                                 (int2)(x+I,y+ii),      \
-                                 color->iN.rgba[ii] A); \
-      }
-
-#else
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) {                               \
-      SKC_RENDER_SURFACE_COLOR const rgba =                     \
-        (SKC_RENDER_SURFACE_COLOR)                              \
-        (color->aN.rgba[ii].r C,                                \
-        color->aN.rgba[ii].g C,                                 \
-        color->aN.rgba[ii].b C,                                 \
-        1.0);                                                   \
-      SKC_RENDER_SURFACE_WRITE(surface,(int2)(x+I,y+ii),rgba);  \
-    }
-
-#endif
-
-      SKC_RENDER_SCANLINE_VECTOR_EXPAND();
-    }
-
-#endif
-}
-
-#endif
-
-//
-//
-//
-static
-uint const
-skc_ttck_lane(uint const ttck_idx)
-{
-  return ttck_idx & SKC_RENDER_SUBGROUP_MASK;
-}
-
-//
-// RENDER KERNEL
-//
-
-__kernel
-SKC_RENDER_KERNEL_ATTRIBS
-void
-skc_kernel_render(__global   union  skc_layer_node   const * SKC_RESTRICT const layers,
-                  __global   struct skc_group_node   const * SKC_RESTRICT const groups,
-                  __global   union  skc_styling_cmd  const * SKC_RESTRICT const commands,     // FIXME -- rename
-
-                  __global   skc_ttck_t              const * SKC_RESTRICT const ttck_keys,    // rename: keys
-                  skc_uint                                                const ttck_count,   // rename: key_count
-
-                  __global   uint                    const * SKC_RESTRICT const ttck_offsets, // rename: offsets
-                  skc_uint                                                const tile_count,   // rename: offset_count
-
-                  __global   skc_ttxb_t              const * SKC_RESTRICT const ttxb_extent,
-#ifdef SKC_SURFACE_IS_BUFFER
-                  __global   void                          * SKC_RESTRICT const surface,
-#else
-                  __write_only image2d_t                                        surface,
-#endif
-#ifdef SKC_SURFACE_IS_BUFFER
-                  skc_uint                                                const surface_pitch,
-#endif
-                  uint4                                                   const tile_clip)    // rename: clip
-{
-  //
-  // Each subgroup is responsible for a tile.  No extra subgroups are
-  // launched.
-  //
-  // FIXME -- might be better implemented as a "grid stride loop" if
-  // Intel GEN really has a local memory "quantum" of 4KB which means
-  // we would need to launch 4 subgroups per workgroup.
-  //
-  // Confirmed: GEN8 has 4KB SLM workgroup min while GEN9 is 1KB.
-  //
-
-  //
-  // declare tile cover and color registers
-  //
-  // this used to be a neat unified struct but the Intel GEN compiler
-  // wasn't cooperating and spilling to private memory even though all
-  // registers were indexed by constants
-  //
-  union skc_tile_color  color_wip;
-  union skc_tile_color  color_acc;
-
-  union skc_tile_cover  cover_wip;
-  union skc_tile_cover  cover_acc;
-  union skc_tile_cover  cover_msk;
-
-  //
-  // which subgroup in the grid is this?
-  //
-  // TAKE NOTE: the Intel GEN compiler is recognizing get_group_id(0)
-  // as a uniform but the alternative calculation used when there are
-  // multiple subgroups per workgroup is not cooperating and
-  // driving spillage elsewhere.
-  //
-#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 )
-  skc_uint const ttck_offset_idx = get_group_id(0);
-#else
-  skc_uint const ttck_offset_idx = get_group_id(0) * SKC_RENDER_WORKGROUP_SUBGROUPS + get_sub_group_id();
-#endif
-
-  //
-  // load the starting ttck for this offset and get a bound on the max
-  // number of keys that might be loaded
-  //
-  // these are uniform across all subgroup lanes
-  //
-  skc_uint ttck_idx = ttck_offsets[ttck_offset_idx];
-
-  //
-  // FIXME -- SIMD/CPU version should probaby load a 256-bit (4-wide)
-  // vector of ttck keys
-  //
-#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK
-
-  skc_ttck_t ttck = ttck_keys[ttck_idx];
-
-#else
-
-  uint const ttck_base = ttck_idx & ~SKC_RENDER_SUBGROUP_MASK;
-  uint const ttck_lane = ttck_idx &  SKC_RENDER_SUBGROUP_MASK;
-  skc_ttck_t ttck_s    = ttck_keys[min(ttck_base+max(get_sub_group_local_id(),ttck_lane),ttck_count-1)]
-
-#endif
-
-  //
-  // set up style group/layer state
-  //
-  struct skc_styling_group {
-    union skc_group_range range;
-    skc_uint              depth;
-    skc_uint              id;
-  } group;
-
-  group.range.lo = 0;
-  group.range.hi = SKC_UINT_MAX;
-  group.depth    = 0;
-  group.id       = SKC_UINT_MAX;
-
-  //
-  // start with clear tile opacity, knockout and flag bits
-  //
-  // uint color_acc_opacity  = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32
-  // uint cover_acc_knockout = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32
-  //
-  skc_uint flags = 0;
-
-  //
-  // declare and initialize accumulators
-  //
-#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 )
-  __local union skc_subgroup_smem                      smem[1];
-#else
-  __local union skc_subgroup_smem                      smem_wg[SKC_RENDER_WORKGROUP_SUBGROUPS];
-  __local union skc_subgroup_smem * SKC_RESTRICT const smem = smem_wg + get_sub_group_id();
-#endif
-
-#ifdef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK
-  //
-  // select the initial ttck key
-  //
-  skc_ttck_t ttck;
-#if 0
-  ttck    = sub_group_broadcast(ttck_s,ttck_lane);    // SHOULD WORK BUT .4454 COMPILER IS BROKEN
-#else
-  ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane); // EXPLICIT WORKAROUND
-  ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane);
-#endif
-
-#endif
-
-  //
-  // save the first key so we know what tile we're in
-  //
-  skc_ttck_t ttck0 = ttck;
-
-  //
-  // evaluate the coarse clip as late as possible
-  //
-  skc_uint const ttck_hi_x = skc_ttck_hi_get_x(ttck0.hi);
-
-  if ((ttck_hi_x < tile_clip.lo.x) || (ttck_hi_x >= tile_clip.hi.x))
-    return;
-
-  skc_uint const ttck_hi_y = skc_ttck_hi_get_y(ttck0.hi);
-
-  if ((ttck_hi_y < tile_clip.lo.y) || (ttck_hi_y >= tile_clip.hi.y))
-    return;
-
-#if 0
-  printf("< %u, %u >\n",ttck_hi_x,ttck_hi_y);
-#endif
-
-  //
-  // load -> scatter -> flush
-  //
-  while (true)
-    {
-      // if scattering is disabled then just run through ttck keys
-      bool const is_scatter_enabled = (flags & SKC_TILE_FLAGS_SCATTER_SKIP) == 0;
-
-      // need to clear accumulators before a scatter loop
-      if (is_scatter_enabled)
-        {
-          skc_tile_aa_zero(smem);
-        }
-
-      do {
-        // skip scattering?
-        if (is_scatter_enabled)
-          {
-            skc_block_id_t const xb_id = skc_ttck_lo_get_ttxb_id(ttck.lo);
-
-            if (skc_ttck_lo_is_prefix(ttck.lo)) {
-              skc_scatter_ttpb(ttxb_extent,smem,xb_id);
-            } else {
-              skc_scatter_ttsb(ttxb_extent,smem,xb_id);
-            }
-          }
-
-        //
-        // any ttck keys left?
-        //
-        if (++ttck_idx >= ttck_count)
-          {
-            flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE;
-            break;
-          }
-
-        //
-        // process next ttck key
-        //
-#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK
-        //
-        // SIMD -- read next key
-        //
-        ttck = ttck_keys[ttck_idx];
-#else
-        //
-        // SIMT -- refresh the ttck_s?
-        //
-        uint const ttck_lane_next = ttck_idx & SKC_RENDER_SUBGROUP_MASK;
-
-        if (ttck_lane_next == 0)
-          ttck_s = ttck_keys[min(ttck_idx+get_sub_group_local_id(),ttck_count-1)];
-
-        //
-        // broadcast next key to entire subgroup
-        //
-#if 0
-        ttck    = sub_group_broadcast(ttck_s,ttck_lane_next);    // SHOULD WORK BUT .4454 COMPILER IS BROKEN
-#else
-        ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane_next); // EXPLICIT WORKAROUND
-        ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane_next);
-#endif
-#endif
-        // continue scattering if on same YXL layer
-      } while (skc_ttck_equal_yxl(ttck0,ttck));
-
-      // finalize if no longer on same YX tile
-      if (!skc_ttck_hi_equal_yx(ttck0.hi,ttck.hi))
-        {
-          // otherwise, unwind the tile styling and exit
-          flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE;
-        }
-
-      //
-      // given: new layer id from ttxk key
-      //
-      // load [layer id]{ group id, depth }
-      //
-      // if within current group's layer range
-      //
-      //   if at same depth
-      //
-      //     load and execute cover>[mask>]color>blend commands
-      //
-      //   else if not at same depth then move deeper
-      //
-      //     for all groups in group trail from cur depth to new depth
-      //       enter group, saving and initializing regs as necessary
-      //     increment depth and update layer range
-      //     load and execute cover>[mask>]color>blend commands
-      //
-      // else not within layer range
-      //
-      //   exit current group, restoring regs as necessary
-      //   decrement depth and update layer range
-      //
-      //
-      skc_layer_id         const layer_id_new   = skc_ttck_get_layer(ttck0); // FIXME -- this was ttck_hi
-      union skc_layer_node const layer_node_new = layers[layer_id_new];
-
-      // clear flag that controls group/layer traversal
-      flags &= ~SKC_TILE_FLAGS_FLUSH_COMPLETE;
-
-      do {
-        bool const unwind = (flags & SKC_TILE_FLAGS_FLUSH_UNWIND) != 0;
-
-        //
-        // is layer a child of the current parent group?
-        //
-        uint cmd_next = 0;
-
-        if (!unwind && (layer_node_new.parent == group.id))
-          {
-            // execute this layer's cmds
-            cmd_next = layer_node_new.cmds;
-
-            // if this is final then configure so groups get unwound, otherwise we're done
-            flags   |= ((flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) ? SKC_TILE_FLAGS_FLUSH_UNWIND : SKC_TILE_FLAGS_FLUSH_COMPLETE);
-          }
-        else if (!unwind && (layer_id_new >= group.range.lo && layer_id_new <= group.range.hi))
-          {
-            //
-            // is layer in a child group?
-            //
-            union skc_group_parents const gp = groups[layer_node_new.parent].parents;
-            uint                    const gn = gp.depth - ++group.depth;
-
-            if (gn == 0)
-              group.id = layer_node_new.parent;
-            else
-              group.id = commands[gp.base + gn - 1].parent;
-
-            // update group layer range
-            group.range = groups[group.id].range;
-
-            // enter current group
-            cmd_next    = groups[group.id].cmds.enter;
-          }
-        else // otherwise, exit this group
-          {
-            // enter current group
-            cmd_next = groups[group.id].cmds.leave;
-
-            // decrement group depth
-            if (--group.depth == 0)
-              {
-                flags |= SKC_TILE_FLAGS_FLUSH_COMPLETE;
-              }
-            else
-              {
-                // get path_base of current group
-                uint const gnpb = groups[group.id].parents.base;
-
-                // get parent of current group
-                group.id    = commands[gnpb].parent;
-
-                // update group layer range
-                group.range = groups[group.id].range;
-              }
-          }
-
-        //
-        // execute cmds
-        //
-        while (true)
-          {
-            union skc_styling_cmd const cmd = commands[cmd_next++];
-
-            switch (cmd.u32 & SKC_STYLING_OPCODE_MASK_OPCODE)
-              {
-              case SKC_STYLING_OPCODE_NOOP:
-                break;
-
-              case SKC_STYLING_OPCODE_COVER_NONZERO:
-                skc_tile_cover_nonzero(smem,&cover_wip,&color_wip);
-                break;
-
-              case SKC_STYLING_OPCODE_COVER_EVENODD:
-                skc_tile_cover_evenodd(smem,&cover_wip,&color_wip);
-                break;
-
-              case SKC_STYLING_OPCODE_COVER_ACCUMULATE:
-                skc_tile_cover_accumulate(&cover_acc,&cover_wip);
-                break;
-
-              case SKC_STYLING_OPCODE_COVER_MASK:
-                skc_tile_cover_wip_mask(&cover_wip,&cover_msk);
-                break;
-
-              case SKC_STYLING_OPCODE_COVER_WIP_ZERO:
-                skc_tile_cover_wip_zero(&cover_wip);
-                break;
-
-              case SKC_STYLING_OPCODE_COVER_ACC_ZERO:
-                skc_tile_cover_acc_zero(&cover_acc);
-                break;
-
-              case SKC_STYLING_OPCODE_COVER_MASK_ZERO:
-                skc_tile_cover_msk_zero(&cover_msk);
-                break;
-
-              case SKC_STYLING_OPCODE_COVER_MASK_ONE:
-                skc_tile_cover_msk_one(&cover_msk);
-                break;
-
-              case SKC_STYLING_OPCODE_COVER_MASK_INVERT:
-                skc_tile_cover_msk_invert(&cover_msk);
-                break;
-
-              case SKC_STYLING_OPCODE_COLOR_FILL_SOLID:
-                skc_tile_color_fill_solid(commands,&cmd_next,&color_wip);
-                break;
-
-              case SKC_STYLING_OPCODE_COLOR_FILL_GRADIENT_LINEAR:
-                //
-                // FIXME -- gradients shouldn't be executing so much
-                // conditional driven code at runtime since we *know*
-                // the gradient style on the host can just create a
-                // new styling command to exploit this.
-                //
-                // FIXME -- it might be time to try using the GPU's
-                // sampler on a linear array of half4 vectors -- it
-                // might outperform the explicit load/lerp routines.
-                //
-                // FIXME -- optimizing for vertical gradients (uhhh,
-                // they're actually horizontal due to the -90 degree
-                // view transform) is nice but is it worthwhile to
-                // have this in the kernel?  Easy to add it back...
-                //
-#if defined( SKC_ARCH_GEN9 )
-                // disable gradients due to exessive spillage -- fix later
-                cmd_next += SKC_GRADIENT_CMD_WORDS_V1(commands[cmd_next+6].u32);
-#else
-                skc_tile_color_fill_gradient_linear_nonvertical(smem,commands,&cmd_next,&color_wip,ttck0.hi);
-#endif
-                break;
-
-              case SKC_STYLING_OPCODE_COLOR_WIP_ZERO:
-                skc_tile_color_wip_zero(&color_wip);
-                break;
-
-              case SKC_STYLING_OPCODE_COLOR_ACC_ZERO:
-                skc_tile_color_acc_zero(&color_acc);
-                break;
-
-              case SKC_STYLING_OPCODE_BLEND_OVER:
-                skc_tile_blend_over(&color_acc,&cover_wip,&color_wip);
-                break;
-
-              case SKC_STYLING_OPCODE_BLEND_PLUS:
-                skc_tile_blend_plus(&color_acc,&cover_wip,&color_wip);
-                break;
-
-              case SKC_STYLING_OPCODE_BLEND_MULTIPLY:
-                skc_tile_blend_multiply(&color_acc,&cover_wip,&color_wip);
-                break;
-
-              case SKC_STYLING_OPCODE_BLEND_KNOCKOUT:
-                skc_tile_blend_knockout(&cover_acc,&color_acc,&cover_wip,&color_wip);
-                break;
-
-              case SKC_STYLING_OPCODE_COVER_WIP_MOVE_TO_MASK:
-                // skc_tile_cover_msk_copy_wip(&cover_msk,&cover_wip);
-                break;
-
-              case SKC_STYLING_OPCODE_COVER_ACC_MOVE_TO_MASK:
-                // skc_tile_cover_msk_copy_acc(&cover_msk,&cover_acc);
-                break;
-
-              case SKC_STYLING_OPCODE_BACKGROUND_OVER:
-                skc_tile_background_over(commands,&cmd_next,&color_acc);
-                break;
-
-              case SKC_STYLING_OPCODE_SURFACE_COMPOSITE:
-#ifdef SKC_SURFACE_IS_BUFFER
-                skc_surface_composite_u8_rgba(surface,surface_pitch,&color_acc,ttck0.hi);
-#else
-                skc_surface_composite_u8_rgba(surface,              &color_acc,ttck0.hi);
-#endif
-                break;
-
-              case SKC_STYLING_OPCODE_COLOR_ACC_TEST_OPACITY:
-                if (skc_tile_color_test_opacity(&color_acc))
-                  flags |= SKC_TILE_FLAGS_SCATTER_SKIP;
-                break;
-
-              default:
-                return; // this is an illegal opcode -- trap and die!
-              }
-
-            //
-            // if sign bit is set then this was final command
-            //
-            if (cmd.s32 < 0)
-              break;
-          }
-
-        // continue as long as tile flush isn't complete
-      } while ((flags & SKC_TILE_FLAGS_FLUSH_COMPLETE) == 0);
-
-      // return if was the final flush
-      if (flags & SKC_TILE_FLAGS_FLUSH_FINALIZE)
-        return;
-
-      // update wip ttck_hi
-      ttck0 = ttck;
-    }
-}
-
-//
-//
-//
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "tile.h"
+#include "block.h"
+#include "styling_types.h"
+#include "atomic_cl.h"
+#include "kernel_cl_12.h"
+
+//
+// Subgroup size minus one.
+// NOTE(review): only a valid lane-index mask if the subgroup size is a power of two -- confirm.

+#define SKC_RENDER_SUBGROUP_MASK  (SKC_RENDER_SUBGROUP_SIZE - 1)
+
+// Select the SKC_EXPAND_N() expansion matching the scanline vector size N
+// and record the index of its last component (N - 1).  Sizes other than
+// 1, 2, 4, 8 and 16 leave both macros undefined.
+
+#if   ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 )
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_1()
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      0
+
+#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 2 )
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_2()
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      1
+
+#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 4 )
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_4()
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      3
+
+#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 8 )
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_8()
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      7
+
+#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 16)
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_16()
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      15
+#endif
+
+//
+// tile state flag bits -- each value is a distinct bit so flags can be OR'd together
+//
+
+typedef enum skc_tile_flags_e {
+
+  // FLUSH
+  SKC_TILE_FLAGS_FLUSH_FINALIZE    = 0x00000001, // the render loop returns after a flush that carries this bit
+  SKC_TILE_FLAGS_FLUSH_UNWIND      = 0x00000002,
+  SKC_TILE_FLAGS_FLUSH_COMPLETE    = 0x00000004, // the styling traversal loop repeats until this bit is set
+
+  // OPACITY
+  SKC_TILE_FLAGS_SCATTER_SKIP      = 0x00000008, // set when skc_tile_color_test_opacity() returns true
+
+  //
+  // Note: testing for opacity and skipping scattering is on its way
+  // to becoming a much more programmable option because sometimes we
+  // may be compositing/blending from back-to-front and/or be using
+  // group blend rules that ignore opacity.
+  //
+  // The point is that all of these decisions should be encoded in
+  // styling commands and, as much as possible, removed from the final
+  // group/layer styling traversal render loop.
+  //
+
+} skc_tile_flags_e;
+
+// Per-tile coverage values; aN and vN are overlapping views of the same storage.
+// COVER -- assumes availability of either fp16 or fp32
+//
+
+union skc_tile_cover
+{
+  struct {
+    SKC_RENDER_TILE_COVER             c[SKC_TILE_WIDTH];
+  } aN; // array view: one cover value per tile column
+
+#ifdef SKC_RENDER_TILE_COVER_VECTOR
+  struct {
+    SKC_RENDER_TILE_COVER_VECTOR      c[SKC_RENDER_TILE_COVER_VECTOR_COUNT];
+  } vN; // vector view: only present when SKC_RENDER_TILE_COVER_VECTOR is defined
+#endif
+};
+
+// Per-tile color state; aN, iN, vN and grad are overlapping views of the same storage.
+// COLOR -- assumes availability of either fp16 or fp32
+//
+
+union skc_tile_color
+{
+  union {
+    struct {
+      SKC_RENDER_TILE_COLOR           r;
+      SKC_RENDER_TILE_COLOR           g;
+      SKC_RENDER_TILE_COLOR           b;
+      SKC_RENDER_TILE_COLOR           a;
+    } rgba[SKC_TILE_WIDTH];
+  } aN; // array view: separate r,g,b,a members per tile column
+
+#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
+  union {
+    SKC_RENDER_TILE_COLOR_INTERLEAVED rgba[SKC_TILE_WIDTH];
+  } iN; // interleaved view: only present when SKC_RENDER_TILE_COLOR_INTERLEAVED is defined
+#endif
+
+#ifdef SKC_RENDER_TILE_COLOR_VECTOR
+  union {
+    SKC_RENDER_TILE_COLOR_VECTOR      rgba[SKC_RENDER_TILE_COLOR_VECTOR_COUNT];
+  } vN; // vector view: only present when SKC_RENDER_TILE_COLOR_VECTOR is defined
+#endif
+
+  struct {
+    union {
+      struct {
+        SKC_RENDER_TILE_COLOR         r;
+        SKC_RENDER_TILE_COLOR         g;
+      };
+      SKC_RENDER_GRADIENT_FLOAT       distance; // shares storage with r and g
+    };
+    union {
+      struct {
+        SKC_RENDER_TILE_COLOR         b;
+        SKC_RENDER_TILE_COLOR         a;
+      };
+      SKC_RENDER_GRADIENT_FLOAT       stoplerp; // shares storage with b and a
+    };
+  } grad[SKC_TILE_WIDTH]; // gradient scratch view: one entry per tile column
+};
+
+//
+// SHARED MEMORY STATE -- sizing of the tile area buffer (tile width plus one extra column)
+//
+
+#define SKC_RENDER_TILE_SMEM_WORDS ((SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT)
+// the buffer's bytes divided by the subgroup size, then that span counted in SKC_RENDER_WIDE_AA elements
+#define SKC_RENDER_WIDE_AA_BYTES   (SKC_RENDER_TILE_SMEM_WORDS * sizeof(int) / SKC_RENDER_SUBGROUP_SIZE)
+#define SKC_RENDER_WIDE_AA_WIDTH   (SKC_RENDER_WIDE_AA_BYTES / sizeof(SKC_RENDER_WIDE_AA))
+
+// Shared-memory scratch passed to the tile routines.  Every member is a view
+// of the same storage: the tile area accumulator (atomic / aN / vN / wide)
+// or a staging area for styling command words (cmds / gc).
+
+union skc_subgroup_smem
+{
+  //
+  // The tiles are stored in column-major / height-major order
+  //
+  // The final column is a guard column that is OK to write to but
+  // will never be read.  It simplifies the TTSB scatter but could be
+  // predicated if SMEM is really at a premium.
+  //
+#if ( SKC_RENDER_SUBGROUP_SIZE > 1 )
+  struct {
+    SKC_ATOMIC_UINT              area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h]
+  } atomic; // atomic view: only present when the subgroup has more than one lane
+#endif
+
+  struct {
+    int                          area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h]
+  } aN;
+
+  struct { // assumption is that height = subgroup
+    SKC_RENDER_AREA_V            area[SKC_TILE_WIDTH + 1][SKC_RENDER_SUBGROUP_SIZE];
+  } vN;
+
+  struct { // assumption is that height = subgroup
+    SKC_RENDER_WIDE_AA           area[SKC_RENDER_WIDE_AA_WIDTH][SKC_RENDER_SUBGROUP_SIZE];
+  } wide; // wide-element view written by skc_tile_aa_zero()
+
+  union skc_styling_cmd          cmds[(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT]; // styling command words (the gradient fill copies its descriptor here)
+
+  half                           gc  [(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT * 2]; // the same storage viewed as half values
+
+#if 0
+  //
+  // SPILL TO GMEM
+  //
+#if (SKC_REGS_COLOR_S > 0) || (SKC_REGS_COVER_S > 0)
+  struct {
+
+#if (SKC_REGS_COLOR_S > 0)
+    union skc_color_r            color[SKC_REGS_COLOR_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH];
+#endif
+
+#if (SKC_REGS_COVER_S > 0)
+    union float                  cover[SKC_REGS_COVER_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH];
+#endif
+
+  } regs;
+#endif
+  //
+  //
+  //
+#endif
+};
+
+// skc_subgroup_lane() yields this work item's index within its subgroup.
+// With a subgroup size of 1 it is the constant 0; otherwise it forwards
+// to get_sub_group_local_id().
+
+#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
+
+#define skc_subgroup_lane()  0
+
+#else
+
+#define skc_subgroup_lane()  get_sub_group_local_id()
+
+#endif
+
+// Key and block word types: every lo/hi key half is an skc_uint, a TTCK
+// key is the skc_uint2 pair of those halves (accessed as .lo / .hi below),
+// and a TTXB word is an skc_int.
+
+typedef skc_uint  skc_ttsk_lo_t;
+typedef skc_uint  skc_ttsk_hi_t;
+
+typedef skc_uint  skc_ttpk_lo_t;
+typedef skc_uint  skc_ttpk_hi_t;
+
+typedef skc_uint  skc_ttxk_lo_t;
+typedef skc_uint  skc_ttxk_hi_t;
+
+typedef skc_uint  skc_ttck_lo_t;
+typedef skc_uint  skc_ttck_hi_t;
+
+typedef skc_uint2 skc_ttck_t;
+
+typedef skc_int   skc_ttxb_t;
+
+//
+// TTCK (32-BIT COMPARE) v1:
+//
+//  0                                                           63
+//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
+//  +----------------------+--------+--------+-------+-----+-----+
+//  |          30          |    1   |    1   |   18  |  7  |  7  |
+//
+//
+// TTCK (32-BIT COMPARE) v2:
+//
+//  0                                                           63
+//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
+//  +----------------------+--------+--------+-------+-----+-----+
+//  |          30          |    1   |    1   |   15  |  9  |  8  |
+//
+//
+// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
+//
+//  0                                                           63
+//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
+//  +----------------------+--------+--------+-------+-----+-----+
+//  |          27          |    1   |    1   |   18  |  9  |  8  |
+//
+
+static
+skc_uint
+skc_ttck_lo_get_ttxb_id(skc_ttck_lo_t const a) // a: low word of a TTCK key
+{
+  return a & SKC_TTCK_LO_MASK_ID; // keep only the bits selected by the ID mask
+}
+
+static
+skc_layer_id
+skc_ttck_get_layer(skc_ttck_t const a)
+{
+  // The layer id is reassembled from bits held in both key words.
+  // FIXME -- a union with a ulong and a shift down and mask is
+  // probably faster on some architectures
+  //
+  skc_uint const lo = (a.lo >> SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // low-word part: shift out the id/prefix/escape bits
+  skc_uint const hi = (a.hi  & SKC_TTCK_HI_MASK_LAYER) << SKC_TTCK_LO_BITS_LAYER; // high-word part: moved up by the low-word layer bit count
+
+  return lo | hi;
+}
+
+static
+skc_uint
+skc_ttck_hi_get_x(skc_ttck_hi_t const a) // a: high word of a TTCK key
+{
+  return SKC_BFE(a,SKC_TTCK_HI_BITS_X,SKC_TTCK_HI_OFFSET_X); // X field via SKC_BFE (presumably a bit-field extract taking bit count, then offset)
+}
+
+static
+skc_uint
+skc_ttck_hi_get_y(skc_ttck_hi_t const a) // a: high word of a TTCK key
+{
+  return a >> SKC_TTCK_HI_OFFSET_Y; // shift only, no mask: the TTCK layouts list Y as the last field of the key
+}
+
+static
+skc_bool
+skc_ttck_equal_yxl(skc_ttck_t const a, skc_ttck_t const b)
+{
+  skc_uint const lo = (a.lo ^ b.lo) & SKC_BITS_TO_MASK_AT(SKC_TTCK_LO_BITS_LAYER,SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // differing low-word bits, restricted by the layer mask
+  skc_uint const hi = (a.hi ^ b.hi); // differing bits anywhere in the high word
+
+  return (lo | hi) == 0; // equal when no compared bit differs
+}
+
+static
+skc_bool
+skc_ttck_hi_equal_yx(skc_ttck_hi_t const a, skc_ttck_hi_t const b) // a, b: high words of two TTCK keys
+{
+  return ((a ^ b) & SKC_TTCK_HI_MASK_YX) == 0; // true when no bit under the YX mask differs
+}
+
+static
+skc_bool
+skc_ttck_lo_is_prefix(skc_ttck_lo_t const a) // a: low word of a TTCK key
+{
+  return (a & SKC_TTCK_LO_MASK_PREFIX) != 0; // true when any bit under the PREFIX mask is set
+}
+
+//
+// TILE TRACE SUBPIXEL
+//
+// The subpixels are encoded with either absolute tile coordinates
+// (32-bits) or packed in delta-encoded form.
+//
+// For 32-bit subpixel packing of a 32x32 tile:
+//
+// A tile X is encoded as:
+//
+//   TX : 10 : unsigned min(x0,x1) tile subpixel coordinate.
+//
+//   SX :  6 : unsigned subpixel span from min to max x with range
+//             [0,32]. The original direction is not captured. Would
+//             be nice to capture dx but not necessary right now but
+//             could be in the future. <--- SPARE VALUES AVAILABLE
+//
+// A tile Y is encoded as:
+//
+//   TY : 10 : unsigned min(y0,y1) tile subpixel coordinate.
+//
+//   DY :  6 : signed subpixel delta y1-y0. The range of delta is
+//             [-32,32] but horizontal lines are not encoded so [1,32]
+//             is mapped to [0,31]. The resulting range [-32,31] fits
+//             in 6 bits.
+//
+// TTS:
+//
+//  0                        31
+//  |  TX |  SX  |  TY |  DY  |
+//  +-----+------+-----+------+
+//  |  10 |   6  |  10 |   6  |
+//
+
+static
+SKC_RENDER_TTS_V_BITFIELD
+skc_tts_get_ty_pixel_v(SKC_RENDER_TTS_V const a)
+{
+  // extract the whole pixel y coordinate:
+  // skip the low SKC_SUBPIXEL_RESL_Y_LOG2 subpixel bits of the TY field
+  // and take the remaining TY bits
+  return SKC_BFE(a,
+                 SKC_TTS_BITS_TY   - SKC_SUBPIXEL_RESL_Y_LOG2,
+                 SKC_TTS_OFFSET_TY + SKC_SUBPIXEL_RESL_Y_LOG2);
+}
+
+static
+SKC_RENDER_TTS_V_BITFIELD
+skc_tts_get_xy_idx_v(SKC_RENDER_TTS_V const a)
+{
+  // get the linear array tile index of the pixel:
+  // the masked x pixel bits are shifted by the difference between
+  // SKC_SUBPIXEL_RESL_X_LOG2 and SKC_TILE_HEIGHT_LOG2, then OR'd with the y pixel
+  return (((a & SKC_TTS_MASK_TX_PIXEL)
+
+#if   (SKC_SUBPIXEL_RESL_X_LOG2 > SKC_TILE_HEIGHT_LOG2)
+           >> (SKC_SUBPIXEL_RESL_X_LOG2 - SKC_TILE_HEIGHT_LOG2)
+#elif (SKC_SUBPIXEL_RESL_X_LOG2 < SKC_TILE_HEIGHT_LOG2)
+           << (SKC_TILE_HEIGHT_LOG2     - SKC_SUBPIXEL_RESL_X_LOG2)
+#endif
+
+           ) | skc_tts_get_ty_pixel_v(a));
+}
+
+#if 0
+static
+skc_ttx_v_s32_t
+skc_tts_get_dy_v(SKC_RENDER_TTS_V const a)
+{
+  skc_ttx_v_s32_t const dy = SKC_AS(skc_ttx_v_s32_t)a >> SKC_TTS_OFFSET_DY;
+
+  return (dy + SKC_AS(skc_ttx_v_s32_t)(~a >> 31));
+}
+#else
+static // active variant; the #if 0 variant above is compiled out
+SKC_RENDER_TTS_V_BITFIELD
+skc_tts_get_dy_v(SKC_RENDER_TTS_V const a)
+{
+  SKC_RENDER_TTS_V_BITFIELD const dy = a >> SKC_TTS_OFFSET_DY; // DY field shifted down to bit 0
+  // NOTE(review): assumes a signed element type so that (~a >> 31) is -1 when a >= 0, making the result dy + 1 -- confirm
+  return dy - (~a >> 31);
+}
+#endif
+
+static
+SKC_RENDER_TTS_V_BITFIELD
+skc_tts_get_tx_subpixel_v(SKC_RENDER_TTS_V const a)
+{
+  return a & SKC_BITS_TO_MASK(SKC_SUBPIXEL_RESL_X_LOG2); // presumably keeps the low SKC_SUBPIXEL_RESL_X_LOG2 bits -- confirm SKC_BITS_TO_MASK
+}
+
+static
+SKC_RENDER_TTS_V_BITFIELD
+skc_tts_get_sx_v(SKC_RENDER_TTS_V const a)
+{
+  return SKC_BFE(a,SKC_TTS_BITS_SX,SKC_TTS_OFFSET_SX); // SX field via SKC_BFE (presumably a bit-field extract taking bit count, then offset)
+}
+
+// Zero the tile area accumulator: for every ii below SKC_RENDER_WIDE_AA_WIDTH
+// each lane stores 0 to entry [ii][lane] of the `wide` view of smem.
+//
+
+static
+void
+skc_tile_aa_zero(__local union skc_subgroup_smem * SKC_RESTRICT const smem)
+{
+  //
+  // SIMD / CPU
+  //
+  //      &
+  //
+  // SIMT / GPU
+  //
+  // Note that atomic_init() is likely implemented as a simple
+  // assignment so there is no identifiable performance difference on
+  // current targets.
+  //
+  // If such an architecture appears in the future then we'll probably
+  // still want to implement this zero'ing operation as below but
+  // follow with an appropriate fence that occurs before any scatter
+  // operations.
+  //
+  // The baroque expansion below improves performance on Intel GEN by,
+  // presumably, achieving the 64-byte per clock SLM write as well as
+  // minimizing the overall number of SEND() block initializations and
+  // launches.
+  //
+  // Intel GENx has a documented 64 byte per cycle SLM write limit.
+  // So having each lane in an 8 lane subgroup zero-write 8 bytes is
+  // probably a safe bet (Later: benchmarking backs this up!).
+  //
+  // Note there is no reason at this time to unroll this loop.
+  //
+  for (uint ii=0; ii<SKC_RENDER_WIDE_AA_WIDTH; ii++)
+    smem->wide.area[ii][skc_subgroup_lane()] = ( 0 );
+}
+
+// Accumulate the TTPB block `pb_id` into the tile: each lane loads its TTP
+// value(s) from ttxb_extent and adds them, shifted, to column 0 of the area.
+//
+// Note this is going to be vectorizable on most architectures.
+// The return of the key translation feature might complicate things.
+
+static
+void
+skc_scatter_ttpb(__global skc_ttxb_t        const * SKC_RESTRICT const ttxb_extent,
+                 __local  union skc_subgroup_smem * SKC_RESTRICT const smem,
+                 skc_block_id_t                                  const pb_id)
+{
+  skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane();
+  // offset counts loads, not words: one scalar when the ratio is 1, one 2-vector when it is 2
+#if   ( SKC_TILE_RATIO == 1 )
+
+  SKC_RENDER_TTP_V const ttp_v = ttxb_extent[offset];
+
+#elif ( SKC_TILE_RATIO == 2 )
+
+  SKC_RENDER_TTP_V const ttp_v = vload2(offset,ttxb_extent);
+
+#else
+
+#error("tile ratio greater than 2 not supported")
+
+#endif
+
+  //
+  // Note there is no need to use an atomic for this operation on the
+  // current group of target platforms... but this may change if
+  // atomic ops truly go through a different path.
+  //
+  // As noted above, this direct increment is probably faster and can
+  // always be followed by a fence.
+  //
+  // Furthermore, note that the key sorting orders all ttck keys
+  // before ttpk keys.
+  //
+
+  //
+  // FIXME -- if the SMEM store is wider than bank word count then we
+  // might want to odd-even interleave the TTP values if the target
+  // device can't handle 64-bit stores
+  //
+
+  //
+  // skipping per-key translation for now
+  //
+  smem->vN.area[0][skc_subgroup_lane()] += ttp_v << (SKC_SUBPIXEL_RESL_X_LOG2 + 1);
+}
+
+// Scatter-add the TTSB block `sb_id` into the tile area accumulator.
+//
+// Note that skc_scatter_ttsb is *not* vectorizable unless the
+// architecture supports a "scatter-add" capability.  All relevant
+// GPUs support atomic add on shared/local memory and thus support
+// scatter-add.
+
+static
+void
+skc_scatter_ttsb(__global skc_ttxb_t        const * SKC_RESTRICT const ttxb_extent,
+                 __local  union skc_subgroup_smem * SKC_RESTRICT const smem,
+                 skc_block_id_t                                  const sb_id)
+{
+  skc_uint         const offset = sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
+
+  SKC_RENDER_TTS_V const tts_v  = ttxb_extent[offset];
+
+  //
+  // Skipping per-key translation for now
+  //
+
+  // Index into tile
+  //
+  // The tiles are stored in column-major / height-major order
+  //
+  // The final column is a guard column that is OK to write to but
+  // will never be read.  It simplifies the TTSB scatter but could be
+  // predicated if SMEM is really at a premium.
+  //
+
+  SKC_RENDER_TTS_V_BITFIELD const xy_idx = skc_tts_get_xy_idx_v(tts_v);
+
+#if 0
+  if (tts_v != SKC_TTS_INVALID)
+    printf("(%08X) = %u\n",tts_v,xy_idx);
+#endif
+
+  //
+  // adjust subpixel range to max y
+  //
+  // range is stored as [-32,31] and when read [0,31] is mapped to
+  // [1,32] because a dy of 0 is not possible.
+  //
+  // more succinctly: if dy >= 0 then ++dy
+  //
+  SKC_RENDER_TTS_V_BITFIELD const dy     = skc_tts_get_dy_v(tts_v);
+
+  //
+  // FIXME -- benchmark performance of setting dy to 0 if ttsv.vN is invalid?
+  //
+
+  // this "min(x0) * 2 + dx" is equivalent to "x0 + x1"
+  SKC_RENDER_TTS_V_BITFIELD const widths = skc_tts_get_tx_subpixel_v(tts_v) * 2 + skc_tts_get_sx_v(tts_v);
+
+  // Calculate left and right coverage contribution trapezoids
+  SKC_RENDER_TTS_V_BITFIELD const left   = dy * widths;
+  SKC_RENDER_TTS_V_BITFIELD const right  = (dy << (SKC_SUBPIXEL_RESL_X_LOG2 + 1)) - left;
+
+  //
+  // Accumulate altitudes and areas
+  //
+  // Optimization: if the device supports a CPU/SIMD vector-add or
+  // GPU/SIMT scatter-add atomic int2 add operation then placing the
+  // ALT and AREA values side-by-side would halve the number of
+  // additions.
+  //
+#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
+  //
+  // CPU/SIMD
+  //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A)                                 \
+  if (tts_v C != SKC_TTS_INVALID) {                             \
+    smem->aN.area[SKC_TILE_HEIGHT + xy_idx C] += left  C;       \
+    smem->aN.area[                  xy_idx C] += right C;       \
+  }
+
+#else
+  //
+  // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD
+  //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A)                                         \
+  if (tts_v C != SKC_TTS_INVALID) {                                     \
+    SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area +           \
+                                          SKC_TILE_HEIGHT   + xy_idx C, \
+                                          left C);                      \
+    SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + xy_idx C, \
+                                          right C);                     \
+  }
+#endif
+
+  SKC_RENDER_TTSB_EXPAND(); // presumably expands the SKC_EXPAND_X body defined just above -- confirm
+}
+
+// Fill-rule constants: MAX_AREA, twice that, the mask (2 * MAX_AREA - 1) and the reciprocal of MAX_AREA.
+// Note that 2048.0 can be represented exactly with fp16... fortuitous!
+//
+
+#define SKC_RENDER_FILL_MAX_AREA          (2u * SKC_SUBPIXEL_RESL_X * SKC_SUBPIXEL_RESL_Y)
+#define SKC_RENDER_FILL_MAX_AREA_2        (2u * SKC_RENDER_FILL_MAX_AREA)
+#define SKC_RENDER_FILL_EVEN_ODD_MASK     (SKC_RENDER_FILL_MAX_AREA_2 - 1)
+#define SKC_RENDER_FILL_MAX_AREA_RCP_F32  (SKC_RENDER_TILE_COVER)(1.0f / SKC_RENDER_FILL_MAX_AREA)
+
+// Non-zero fill rule: walk the tile columns keeping a running sum of the
+// area values, clamp its magnitude to SKC_RENDER_FILL_MAX_AREA and scale by
+// the reciprocal of that maximum.  The `color` argument is not used.
+
+static
+void
+skc_tile_cover_nonzero(__local union skc_subgroup_smem * SKC_RESTRICT const smem,
+                       union skc_tile_cover            * SKC_RESTRICT const cover,
+                       union skc_tile_color            * SKC_RESTRICT const color)
+{
+  SKC_RENDER_ACC_COVER_INT area = 0;
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    {
+      area                                   += smem->vN.area[ii][skc_subgroup_lane()];
+      SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area);
+      SKC_RENDER_TILE_COVER     const nonzero = SKC_CONVERT(SKC_RENDER_TILE_COVER)(min(trapabs,SKC_RENDER_FILL_MAX_AREA));
+
+      cover->aN.c[ii] = nonzero * (SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA_RCP_F32);
+    }
+}
+
+static
+void
+skc_tile_cover_evenodd(__local union skc_subgroup_smem * SKC_RESTRICT const smem,
+                       union skc_tile_cover            * SKC_RESTRICT const cover,
+                       union skc_tile_color            * SKC_RESTRICT const color)
+{
+  SKC_RENDER_ACC_COVER_INT area = 0;
+  // Even-odd fill rule over a running sum of the column areas; the `color` argument is not used.
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    {
+      area                                   += smem->vN.area[ii][skc_subgroup_lane()];
+      SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area);
+      SKC_RENDER_ACC_COVER_UINT const reflect = abs(SKC_AS(SKC_RENDER_ACC_COVER_INT)((trapabs & SKC_RENDER_FILL_EVEN_ODD_MASK) - SKC_RENDER_FILL_MAX_AREA));
+      // reflect is the distance of the masked sum from MAX_AREA, so coverage rises to 1 at MAX_AREA and falls back to 0 as the sum keeps growing
+      cover->aN.c[ii] = SKC_CONVERT(SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA - reflect) * (SKC_RENDER_TILE_COVER)SKC_RENDER_FILL_MAX_AREA_RCP_F32;
+    }
+}
+
+// Solid fill: read two color pairs (r,g then b,a) from the styling command
+// at *cmd_next, advance *cmd_next by 2, and write the same r,g,b,a into
+// every entry of `color`.
+
+static
+void
+skc_tile_color_fill_solid(__global union skc_styling_cmd const * SKC_RESTRICT const commands,
+                          uint                                 * SKC_RESTRICT const cmd_next,
+                          union skc_tile_color                 * SKC_RESTRICT const color)
+{
+  //
+  // rgba = solid fill
+  //
+  __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0;
+
+  *cmd_next += 2;
+
+#if !defined( SKC_RENDER_TILE_COLOR_VECTOR )
+
+  SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    color->aN.rgba[ii].r = rg.lo;
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    color->aN.rgba[ii].g = rg.hi;
+
+  SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    color->aN.rgba[ii].b = ba.lo;
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    color->aN.rgba[ii].a = ba.hi;
+
+#else
+
+  SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);
+  SKC_RENDER_TILE_COLOR      const r  = rg.lo;
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].even.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(r);
+
+  SKC_RENDER_TILE_COLOR      const g  = rg.hi;
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].odd.even  = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(g);
+
+  SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);
+  SKC_RENDER_TILE_COLOR      const b  = ba.lo;
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].even.odd  = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(b);
+
+  SKC_RENDER_TILE_COLOR      const a  = ba.hi;
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].odd.odd   = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(a);
+
+#endif
+}
+
+//
+// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"
+//
+// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/
+//
+// Lerp in two fma/mad ops:
+//
+//    t * b + ((-t) * a + a)
+//
+// Note: OpenCL documents mix() as being implemented as:
+//
+//    a + (b - a) * t
+//
+// But this may be a native instruction on some devices.  For example,
+// on GEN9 there is an LRP "linear interpolation" function but it
+// doesn't appear to support half floats.
+// Note: the mad() form below expands its `a` and `t` arguments twice each.
+
+#if 1
+#define SKC_LERP(a,b,t)  mad(t,b,mad(-(t),a,a))
+#else
+#define SKC_LERP(a,b,t)  mix(a,b,t)
+#endif
+
+// SKC_RENDER_GRADIENT_SPACE expands to __local unless SKC_RENDER_GRADIENT_IS_GLOBAL is defined, in which case it is __global.
+// CPUs have a mock local address space so copying the gradient header
+// is probably not useful.  Just read directly from global.
+//
+
+#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL
+#define SKC_RENDER_GRADIENT_SPACE  __local
+#else
+#define SKC_RENDER_GRADIENT_SPACE  __global
+#endif
+
+//
+// gradient is non-vertical
+//
+// removed the vertical (actually, horizontal) special case
+//
+
+static
+void
+skc_tile_color_fill_gradient_linear_nonvertical(__local  union skc_subgroup_smem     * SKC_RESTRICT const smem,
+                                                __global union skc_styling_cmd const * SKC_RESTRICT const commands,
+                                                uint                                 * SKC_RESTRICT const cmd_next,
+                                                union skc_tile_color                 * SKC_RESTRICT const color,
+                                                skc_ttck_hi_t                                       const ttck_hi)
+{
+  //
+  // Where is this tile?
+  //
+  // Note that the gradient is being sampled from pixel centers.
+  //
+  SKC_RENDER_GRADIENT_FLOAT const y =
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) I##.5f P
+    (SKC_RENDER_GRADIENT_FLOAT)( SKC_RENDER_SCANLINE_VECTOR_EXPAND() ) +
+    (skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE));
+
+  float                     const x = 0.5f + (skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH);
+
+  //
+  // Get starting numerator and denominator
+  //
+  // Note: if gh[0].dx is exactly 0.0f then this is a vertical
+  // gradient and can be handled by a special opcode.
+  //
+  // Note: the mad() ordering is slightly different than the original
+  // CUDA implementation.
+  //
+  union skc_gradient_vector const gv       = { vload4(0,&commands[*cmd_next].f32) };
+
+  *cmd_next += 4;
+
+  float                     const gv_x_dot = mad(x,gv.dx,gv.p0);
+  SKC_RENDER_GRADIENT_FLOAT const gv_numer = mad(y,gv.dy,gv_x_dot);
+
+  //
+  // Where are columns along gradient vector?
+  //
+  // TODO: Note that the gv_denom isn't multiplied through.
+  //
+  // Please doublecheck this... but I recall that in certain cases
+  // this wipes out some precision and results in minor but noticeable
+  // gradient artifacts.
+  //
+  // All arguments are scalars except gv_numer so a simpler
+  // evaluation might save some flops.
+  //
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    color->grad[ii].distance = mad(gv.dx,(float)ii,gv_numer) * gv.denom;
+
+  //
+  // is gradient non-repeating, repeating or reflecting?
+  //
+  switch (commands[(*cmd_next)++].u32)
+    {
+    case SKC_STYLING_GRADIENT_TYPE_LINEAR_NON_REPEATING:
+      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+        color->grad[ii].distance = clamp(color->grad[ii].distance,0.0f,1.0f);
+      break;
+
+    case SKC_STYLING_GRADIENT_TYPE_LINEAR_REPEATING:
+      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+        color->grad[ii].distance -= floor(color->grad[ii].distance);
+      break;
+
+    default: // PXL_STYLING_GRADIENT_TYPE_LINEAR_REFLECTING
+      //
+      // OPTIMIZATION: Can this be done in fewer than ~4 ops?
+      //
+      // Note: OpenCL "rint()" is round-to-nearest-even integer!
+      //
+      // Note: the floor() "round to -inf" op is implemented in the
+      // GEN op 'FRC' so probably don't use trunc() when floor will
+      // suffice.
+      //
+
+      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+        {
+          SKC_RENDER_GRADIENT_FLOAT dist_abs = fabs(color->grad[ii].distance);
+          color->grad[ii].distance = fabs(dist_abs - rint(dist_abs));
+        }
+    }
+
+  //
+  // initialize "stoplerp" for all columns
+  //
+  uint const slope_count = commands[(*cmd_next)++].u32;
+  uint const gd_n_v1     = commands[(*cmd_next)++].u32; // REMOVE ME
+
+  {
+    float const slope = commands[(*cmd_next)++].f32;
+
+    // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+    for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+      color->grad[ii].stoplerp = color->grad[ii].distance * slope;
+  }
+
+  //
+  // compute stoplerp for remaining stops
+  //
+  for (int jj=1; jj<slope_count; jj++)
+    {
+      float const floor = (float)jj;
+      float const slope = commands[(*cmd_next)++].f32;
+
+      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+        color->grad[ii].stoplerp = mad(min(0, color->grad[ii].stoplerp - floor),slope,color->grad[ii].stoplerp);
+    }
+
+  //
+  // copy gradient colors to local memory
+  //
+  uint const gd_n = slope_count + 1;
+
+#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL
+  //
+  // copy entire gradient descriptor to local memory
+  //
+  for (uint ii=skc_subgroup_lane(); ii<gd_n*4; ii+=SKC_RENDER_SUBGROUP_SIZE)
+    smem->cmds[ii].u32 = commands[*cmd_next + ii].u32;
+
+  __local  half const * const SKC_RESTRICT gc = smem->gc + 0;
+#else
+  //
+  // prefetch entire gradient header
+  //
+  // no noticeable impact on performance
+  //
+  // prefetch(&commands[*cmd_next].u32,gh_words);
+  //
+  __global half const * const SKC_RESTRICT gc = commands[*cmd_next].f16a2 + 0;
+#endif
+
+  //
+  // adjust cmd_next so that V1 structure is consumed -- FIXME
+  //
+  *cmd_next += SKC_GRADIENT_CMD_WORDS_V2_ADJUST(gd_n_v1,gd_n);
+
+  //
+  // lerp between color pair stops
+  //
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    {
+      //
+      // Finally, we have the gradient stop index and the color stop
+      // pair lerp fraction
+      //
+      // Note that if these are vector values then a gather operation
+      // must occur -- there may be platforms (AVX-512?) that can
+      // perform an explicit gather on a vector type but it's not
+      // really expressible in OpenCL except implicitly with a
+      // workgroup of work items.
+      //
+      // ***********************
+      //
+      // FIXME -- USE HERB'S SINGLE FMA LERP
+      //
+      // ***********************
+      //
+      SKC_RENDER_GRADIENT_STOP const gc_stop = SKC_CONVERT(SKC_RENDER_GRADIENT_STOP)(color->grad[ii].stoplerp);
+      SKC_RENDER_GRADIENT_FRAC const gc_frac = SKC_CONVERT(SKC_RENDER_GRADIENT_FRAC)(color->grad[ii].stoplerp - floor(color->grad[ii].stoplerp));
+
+      {
+        SKC_RENDER_TILE_COLOR lo, hi;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
+          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + 0,gc); \
+          lo C                                = cc.lo;                  \
+          hi C                                = cc.hi;                  \
+        }
+
+        SKC_RENDER_SCANLINE_VECTOR_EXPAND();
+
+        color->aN.rgba[ii].r = SKC_LERP(lo,hi,gc_frac);
+      }
+
+      //
+      //
+      //
+      {
+        SKC_RENDER_TILE_COLOR lo, hi;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
+          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n,gc); \
+          lo C                                = cc.lo;                  \
+          hi C                                = cc.hi;                  \
+        }
+
+        SKC_RENDER_SCANLINE_VECTOR_EXPAND();
+
+        color->aN.rgba[ii].g = SKC_LERP(lo,hi,gc_frac);
+      }
+
+      //
+      //
+      //
+      {
+        SKC_RENDER_TILE_COLOR lo, hi;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
+          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*2,gc); \
+          lo C                                = cc.lo;                  \
+          hi C                                = cc.hi;                  \
+        }
+
+        SKC_RENDER_SCANLINE_VECTOR_EXPAND();
+
+        color->aN.rgba[ii].b = SKC_LERP(lo,hi,gc_frac);
+      }
+
+      //
+      //
+      //
+      {
+        SKC_RENDER_TILE_COLOR lo, hi;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
+          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*3,gc); \
+          lo C                                = cc.lo;                  \
+          hi C                                = cc.hi;                  \
+        }
+
+        SKC_RENDER_SCANLINE_VECTOR_EXPAND();
+
+        color->aN.rgba[ii].a = SKC_LERP(lo,hi,gc_frac);
+      }
+    }
+}
+
+//
+// "Over" blend: add the work-in-progress color into the accumulator,
+// scaled by the wip cover times the accumulator's current alpha term.
+
+static
+void
+skc_tile_blend_over(union skc_tile_color       * SKC_RESTRICT const color_acc,
+                    union skc_tile_cover const * SKC_RESTRICT const cover_wip,
+                    union skc_tile_color const * SKC_RESTRICT const color_wip)
+{
+  //
+  // weight = cover.wip * acc.a   (acc.a as it was before this update)
+  //
+  // acc.r += weight * wip.r
+  // acc.g += weight * wip.g
+  // acc.b += weight * wip.b
+  // acc.a -= weight * wip.a
+  //
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint i=0; i<SKC_TILE_WIDTH; i++)
+    {
+      SKC_RENDER_TILE_COVER const weight = cover_wip->aN.c[i] * color_acc->aN.rgba[i].a;
+
+      color_acc->aN.rgba[i].r = mad( weight,color_wip->aN.rgba[i].r,color_acc->aN.rgba[i].r);
+      color_acc->aN.rgba[i].g = mad( weight,color_wip->aN.rgba[i].g,color_acc->aN.rgba[i].g);
+      color_acc->aN.rgba[i].b = mad( weight,color_wip->aN.rgba[i].b,color_acc->aN.rgba[i].b);
+      color_acc->aN.rgba[i].a = mad(-weight,color_wip->aN.rgba[i].a,color_acc->aN.rgba[i].a);
+    }
+}
+
+//
+// "Plus" blend: like the over blend, but the per-entry weight is the
+// smaller of the wip cover and the accumulator's current alpha term.
+
+static
+void
+skc_tile_blend_plus(union skc_tile_color       * SKC_RESTRICT const color_acc,
+                    union skc_tile_cover const * SKC_RESTRICT const cover_wip,
+                    union skc_tile_color const * SKC_RESTRICT const color_wip)
+{
+  //
+  // weight = min(cover.wip,a.acc)   (a.acc as it was before this update)
+  //
+  // r.acc += weight * r.wip
+  // g.acc += weight * g.wip
+  // b.acc += weight * b.wip
+  // a.acc -= weight * a.wip
+  //
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint i=0; i<SKC_TILE_WIDTH; i++)
+    {
+      SKC_RENDER_TILE_COVER const weight = fmin(cover_wip->aN.c[i],color_acc->aN.rgba[i].a);
+
+      color_acc->aN.rgba[i].r = mad( weight,color_wip->aN.rgba[i].r,color_acc->aN.rgba[i].r);
+      color_acc->aN.rgba[i].g = mad( weight,color_wip->aN.rgba[i].g,color_acc->aN.rgba[i].g);
+      color_acc->aN.rgba[i].b = mad( weight,color_wip->aN.rgba[i].b,color_acc->aN.rgba[i].b);
+      color_acc->aN.rgba[i].a = mad(-weight,color_wip->aN.rgba[i].a,color_acc->aN.rgba[i].a);
+    }
+}
+
+//
+// "Multiply" blend: scale every accumulator channel, alpha included,
+// by (wip cover * matching wip channel).
+
+static
+void
+skc_tile_blend_multiply(union skc_tile_color       * SKC_RESTRICT const color_acc,
+                        union skc_tile_cover const * SKC_RESTRICT const cover_wip,
+                        union skc_tile_color const * SKC_RESTRICT const color_wip)
+{
+  //
+  // r.acc = r.acc * (cover.wip * r.wip)
+  // g.acc = g.acc * (cover.wip * g.wip)
+  // b.acc = b.acc * (cover.wip * b.wip)
+  // a.acc = a.acc * (cover.wip * a.wip) <-- original note: a.acc is already (1.0 - alpha)
+  //
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint i=0; i<SKC_TILE_WIDTH; i++)
+    {
+      color_acc->aN.rgba[i].r = color_acc->aN.rgba[i].r * (cover_wip->aN.c[i] * color_wip->aN.rgba[i].r);
+      color_acc->aN.rgba[i].g = color_acc->aN.rgba[i].g * (cover_wip->aN.c[i] * color_wip->aN.rgba[i].g);
+      color_acc->aN.rgba[i].b = color_acc->aN.rgba[i].b * (cover_wip->aN.c[i] * color_wip->aN.rgba[i].b);
+      color_acc->aN.rgba[i].a = color_acc->aN.rgba[i].a * (cover_wip->aN.c[i] * color_wip->aN.rgba[i].a);
+    }
+}
+
+//
+// "Knockout" blend: the wip layer contributes only through the part of
+// cover_acc that is still unfilled; cover_acc is updated as well.
+
+static
+void
+skc_tile_blend_knockout(union skc_tile_cover       * SKC_RESTRICT const cover_acc,
+                        union skc_tile_color       * SKC_RESTRICT const color_acc,
+                        union skc_tile_cover const * SKC_RESTRICT const cover_wip,
+                        union skc_tile_color const * SKC_RESTRICT const color_wip)
+{
+  //
+  // share      = (1.0 - cover.acc) * cover.wip
+  // cover.acc += share
+  //
+  // r.acc += share * r.wip
+  // g.acc += share * g.wip
+  // b.acc += share * b.wip
+  // a.acc -= share * a.wip
+  //
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint i=0; i<SKC_TILE_WIDTH; i++)
+    {
+      SKC_RENDER_TILE_COVER const share = (1 - cover_acc->aN.c[i]) * cover_wip->aN.c[i];
+
+      cover_acc->aN.c[i]      = cover_acc->aN.c[i] + share;
+
+      color_acc->aN.rgba[i].r = mad( share,color_wip->aN.rgba[i].r,color_acc->aN.rgba[i].r);
+      color_acc->aN.rgba[i].g = mad( share,color_wip->aN.rgba[i].g,color_acc->aN.rgba[i].g);
+      color_acc->aN.rgba[i].b = mad( share,color_wip->aN.rgba[i].b,color_acc->aN.rgba[i].b);
+      color_acc->aN.rgba[i].a = mad(-share,color_wip->aN.rgba[i].a,color_acc->aN.rgba[i].a);
+    }
+}
+
+//
+// Copy the work-in-progress cover into the mask cover, entry by entry
+// (scalar aN view) or vector by vector (vN view), per the #if below.
+
+static
+void
+skc_tile_cover_msk_copy_wip(union skc_tile_cover       * SKC_RESTRICT const cover_msk,
+                            union skc_tile_cover const * SKC_RESTRICT const cover_wip)
+{
+#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint i=0; i<SKC_TILE_WIDTH; i++)
+    cover_msk->aN.c[i] = cover_wip->aN.c[i];
+
+#else
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
+  for (uint v=0; v<SKC_RENDER_TILE_COVER_VECTOR_COUNT; v++)
+    cover_msk->vN.c[v] = cover_wip->vN.c[v];
+
+#endif
+}
+
+//
+// Copy the accumulated cover into the mask cover, entry by entry
+// (scalar aN view) or vector by vector (vN view), per the #if below.
+
+static
+void
+skc_tile_cover_msk_copy_acc(union skc_tile_cover       * SKC_RESTRICT const cover_msk,
+                            union skc_tile_cover const * SKC_RESTRICT const cover_acc)
+{
+#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint i=0; i<SKC_TILE_WIDTH; i++)
+    cover_msk->aN.c[i] = cover_acc->aN.c[i];
+
+#else
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
+  for (uint v=0; v<SKC_RENDER_TILE_COVER_VECTOR_COUNT; v++)
+    cover_msk->vN.c[v] = cover_acc->vN.c[v];
+
+#endif
+}
+
+//
+// Fold the work-in-progress cover into the accumulated cover; the wip
+// cover is weighted by whatever the accumulator has not yet covered.
+
+static
+void
+skc_tile_cover_accumulate(union skc_tile_cover       * SKC_RESTRICT const cover_acc,
+                          union skc_tile_cover const * SKC_RESTRICT const cover_wip)
+{
+  //
+  // cover.acc = (1.0 - cover.acc) * cover.wip + cover.acc
+  // (evaluated below as a single mad() per entry)
+  //
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint i=0; i<SKC_TILE_WIDTH; i++)
+    cover_acc->aN.c[i] = mad(1 - cover_acc->aN.c[i],cover_wip->aN.c[i],cover_acc->aN.c[i]);
+}
+
+//
+// Apply the mask cover to the work-in-progress cover by multiplying
+// the two together, one entry at a time.
+
+static
+void
+skc_tile_cover_wip_mask(union skc_tile_cover       * SKC_RESTRICT const cover_wip,
+                        union skc_tile_cover const * SKC_RESTRICT const cover_msk)
+{
+  //
+  // cover.wip = cover.wip * cover.msk
+  //
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint i=0; i<SKC_TILE_WIDTH; i++)
+    cover_wip->aN.c[i] = cover_wip->aN.c[i] * cover_msk->aN.c[i];
+}
+
+//
+// Zero the cover passed in `cover`, through either the scalar (aN) or
+// the vector (vN) member of the union, selected at compile time.
+
+static
+void
+skc_tile_cover_wip_zero(union skc_tile_cover * SKC_RESTRICT const cover)
+{
+#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 )
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    cover->aN.c[ii] = 0;
+
+#else
+  // vector member of the union: SKC_RENDER_TILE_COVER_VECTOR_COUNT stores
+  // GEN9 compiler underperforms on this -- but the GEN9 test on the #if
+  // above is commented out, so a GEN9 vector build still takes this path
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
+    cover->vN.c[ii] = 0;
+
+#endif
+}
+// Zero the cover passed in `cover`; the body matches skc_tile_cover_wip_zero.
+static
+void
+skc_tile_cover_acc_zero(union skc_tile_cover * SKC_RESTRICT const cover)
+{
+#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 )
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    cover->aN.c[ii] = 0;
+
+#else
+  // vector member of the union: SKC_RENDER_TILE_COVER_VECTOR_COUNT stores
+  // GEN9 compiler underperforms on this -- but the GEN9 test on the #if
+  // above is commented out, so a GEN9 vector build still takes this path
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
+    cover->vN.c[ii] = 0;
+
+#endif
+}
+// Zero the cover passed in `cover`; GEN9 builds are forced onto the scalar path.
+static
+void
+skc_tile_cover_msk_zero(union skc_tile_cover * SKC_RESTRICT const cover)
+{
+#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    cover->aN.c[ii] = 0;
+
+#else
+  // vector member of the union: SKC_RENDER_TILE_COVER_VECTOR_COUNT stores
+  // GEN9 compiler underperforms on this, which is why SKC_ARCH_GEN9 is
+  // routed to the scalar loop by the #if above
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
+    cover->vN.c[ii] = 0;
+
+#endif
+}
+
+//
+// Set the cover passed in `cover` to one: literal 1 per scalar entry, or
+// SKC_RENDER_TILE_COVER_VECTOR_ONE per vector entry.
+
+static
+void
+skc_tile_cover_msk_one(union skc_tile_cover * SKC_RESTRICT const cover)
+{
+#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    cover->aN.c[ii] = 1;
+
+#else
+  // vector member of the union: SKC_RENDER_TILE_COVER_VECTOR_COUNT stores
+  // GEN9 compiler underperforms on this, which is why SKC_ARCH_GEN9 is
+  // routed to the scalar loop by the #if above
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
+    cover->vN.c[ii] = SKC_RENDER_TILE_COVER_VECTOR_ONE;
+
+#endif
+}
+
+//
+// Replace every mask cover value c with (1 - c), in place, through the
+// scalar (aN) or vector (vN) member chosen by the #if below.
+
+static
+void
+skc_tile_cover_msk_invert(union skc_tile_cover * SKC_RESTRICT const cover)
+{
+#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint i=0; i<SKC_TILE_WIDTH; i++)
+    cover->aN.c[i] = 1 - cover->aN.c[i];
+
+#else
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
+  for (uint v=0; v<SKC_RENDER_TILE_COVER_VECTOR_COUNT; v++)
+    cover->vN.c[v] = 1 - cover->vN.c[v];
+
+#endif
+}
+
+//
+// Reset the color passed in `color`: the scalar path stores r = g = b = 0
+// and a = 1 for every entry; the vector path does it with swizzle stores.
+
+static
+void
+skc_tile_color_wip_zero(union skc_tile_color * SKC_RESTRICT const color)
+{
+#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 )
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    {
+      color->aN.rgba[ii].r = 0;
+      color->aN.rgba[ii].g = 0;
+      color->aN.rgba[ii].b = 0;
+      color->aN.rgba[ii].a = 1;
+    }
+
+#else
+  // vector path: three swizzle selections are zeroed, .odd.odd is set to 1
+  // DISABLED ON GEN9 -- probably a compiler bug
+  // NOTE(review): presumably odd.odd selects the alpha lanes -- confirm against the vN layout
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].even.even = 0;
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].odd.even  = 0;
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].even.odd  = 0;
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].odd.odd   = 1;
+#endif
+}
+// Reset the color passed in `color` to r = g = b = 0, a = 1; same body as skc_tile_color_wip_zero.
+static
+void
+skc_tile_color_acc_zero(union skc_tile_color * SKC_RESTRICT const color)
+{
+#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 )
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    {
+      color->aN.rgba[ii].r = 0;
+      color->aN.rgba[ii].g = 0;
+      color->aN.rgba[ii].b = 0;
+      color->aN.rgba[ii].a = 1;
+    }
+
+#else
+  // vector path: three swizzle selections are zeroed, .odd.odd is set to 1
+  // DISABLED ON GEN9 -- probably a compiler bug
+  // NOTE(review): presumably odd.odd selects the alpha lanes -- confirm against the vN layout
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].even.even = 0;
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].odd.even  = 0;
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].even.odd  = 0;
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].odd.odd   = 1;
+#endif
+}
+
+//
+// Opacity test: sum the alpha term of every entry in `color` and return
+// true only when no lane/component of that sum compares non-zero.
+
+static
+bool
+skc_tile_color_test_opacity(union skc_tile_color const * SKC_RESTRICT const color)
+{
+  //
+  // returns true if tile is opaque
+  //
+  // various hacks to test for complete tile opacity
+  //
+  // note that front-to-back currently has alpha at 0.0f -- this can
+  // be harmonized to use a traditional alpha if we want to support
+  // rendering in either direction
+  //
+  // hack -- ADD/MAX/OR all alphas together and test for non-zero
+  // (the code below uses ADD; assumes alphas are never negative -- TODO confirm)
+  SKC_RENDER_TILE_COLOR t = color->aN.rgba[0].a;
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
+  for (uint ii=1; ii<SKC_TILE_WIDTH; ii++)
+    t += color->aN.rgba[ii].a;
+
+#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  // NOTE(review): any() tests the MSB; a scalar compare yields 1, not -1 -- confirm t is a vector here
+  return !any(t != ( 0 ));
+
+#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 )
+  //
+  // SIMT - scalar per lane
+  // one predicate per lane, combined across the subgroup
+  return !sub_group_any(t != 0);
+
+#else
+  //
+  // SIMT - vector per lane
+  // any() collapses the lane's vector first, then the subgroup is combined
+  return !sub_group_any(any(t != ( 0 )));
+
+#endif
+
+  //
+  // TODO: The alternative vector-per-lane implementation below is
+  // *not* believed to be performant because the terse vector-wide
+  // test is just hiding a series of comparisons and is likely worse
+  // than the blind ADD/MAX/OR'ing of all alphas followed by a single
+  // test.
+  // (the block below is compiled out by #if 0)
+#if 0
+  //
+  // SIMT - vector per lane
+  //
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT-1)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    {
+      if (sub_group_any(any(color->vN.ba[ii].a != ( 0 ))))
+        return false;
+    }
+
+  return true;
+#endif
+}
+
+//
+// Blend a background color read from the styling command stream under
+// the tile color; consumes two command words by advancing *cmd_next.
+
+static
+void
+skc_tile_background_over(__global union skc_styling_cmd const * SKC_RESTRICT const commands,
+                         uint                                 * SKC_RESTRICT const cmd_next,
+                         union skc_tile_color                 * SKC_RESTRICT const color)
+{
+  //
+  // acc.r += acc.a * bg.r
+  // acc.g += acc.a * bg.g
+  // acc.b += acc.a * bg.b      (acc.a itself is left untouched)
+  //
+  __global half const * const bg_halves = commands[*cmd_next].f16a2 + 0;
+
+  *cmd_next = *cmd_next + 2;
+
+  SKC_RENDER_TILE_COLOR_PAIR const bg_rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,bg_halves);
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint i=0; i<SKC_TILE_WIDTH; i++)
+    color->aN.rgba[i].r = mad(color->aN.rgba[i].a,bg_rg.lo,color->aN.rgba[i].r);
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint i=0; i<SKC_TILE_WIDTH; i++)
+    color->aN.rgba[i].g = mad(color->aN.rgba[i].a,bg_rg.hi,color->aN.rgba[i].g);
+
+  SKC_RENDER_TILE_COLOR_PAIR const bg_ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,bg_halves);
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint i=0; i<SKC_TILE_WIDTH; i++)
+    color->aN.rgba[i].b = mad(color->aN.rgba[i].a,bg_ba.lo,color->aN.rgba[i].b);
+}
+
+//
+//
+//
+
+// #define SKC_SURFACE_IS_BUFFER
+#ifdef  SKC_SURFACE_IS_BUFFER
+
+static
+void
+skc_surface_composite_u8_rgba(__global SKC_RENDER_SURFACE_U8_RGBA * SKC_RESTRICT const surface,
+                              skc_uint                                           const surface_pitch,
+                              union skc_tile_color          const * SKC_RESTRICT const color,
+                              skc_ttck_hi_t                                      const ttck_hi)
+{
+  // Buffer-surface variant: pack r/g/b per entry and store into surface[].
+  // NEW MAJOR OPTIMIZATION:
+  //
+  // Rotating and rasterizing the original world transform by -90
+  // degrees and then rendering the scene by +90 degrees enables
+  // all the final surface composite to be performed in perfectly
+  // coalesced wide transactions.
+  //
+  // For this reason, linear access to the framebuffer is preferred.
+  //
+  // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv
+  //
+  // NOTE THIS IS TRANSPOSED BY 90 DEGREES
+  //
+  // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE
+  // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING.
+  //
+  // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS
+  // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS
+  //
+  // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL
+  // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER
+  // (as written below: r is OR'd in unshifted, g << 8, b << 16, over a 0xFF000000 seed)
+  uint const pitch = surface_pitch / SKC_RENDER_SCANLINE_VECTOR_SIZE;
+  uint const x     = skc_ttck_hi_get_x(ttck_hi);
+  uint const y     = skc_ttck_hi_get_y(ttck_hi) ;
+  uint const base  = x * SKC_TILE_WIDTH * pitch + y * (SKC_TILE_HEIGHT / SKC_RENDER_SCANLINE_VECTOR_SIZE) + skc_subgroup_lane();
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    {
+      SKC_RENDER_SURFACE_U8_RGBA rgba = ( 0xFF000000 );
+      // NOTE(review): no clamp is visible before these conversions -- confirm channels stay within [0,1]
+      rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].r * 255);
+      rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].g * 255) << 8;
+      rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].b * 255) << 16;
+
+      surface[base + ii * pitch] = rgba; // successive ii land one pitch apart
+
+      // printf("%08v2X\n",rgba);
+    }
+}
+
+#else
+
+static
+void
+skc_surface_composite_u8_rgba(__write_only image2d_t                          surface,
+                              union skc_tile_color const * SKC_RESTRICT const color,
+                              skc_ttck_hi_t                                   const ttck_hi)
+{
+  // Image-surface variant: one SKC_RENDER_SURFACE_WRITE per (x,y) coordinate.
+  // NEW MAJOR OPTIMIZATION:
+  //
+  // Rotating and rasterizing the original world transform by -90
+  // degrees and then rendering the scene by +90 degrees enables
+  // all the final surface composite to be performed in perfectly
+  // coalesced wide transactions.
+  //
+  // For this reason, linear access to the framebuffer is preferred.
+  //
+  // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv
+  //
+  // NOTE THIS IS TRANSPOSED BY 90 DEGREES
+  //
+  // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE
+  // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING.
+  //
+  // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS
+  // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS
+  //
+  // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL
+  // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER
+  // (the non-interleaved paths below pass a constant 1.0 as the fourth channel)
+
+#if 1 // active path: x starts at the tile's x origin, y at this lane's first row
+  int x = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH;
+  int y = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE);
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    {
+#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) {                       \
+        SKC_RENDER_SURFACE_WRITE(surface,               \
+                                 (int2)(x,y+I),         \
+                                 color->iN.rgba[ii] A); \
+      }
+
+#else
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) {                               \
+        SKC_RENDER_SURFACE_COLOR const rgba =                   \
+          (SKC_RENDER_SURFACE_COLOR)                            \
+          (color->aN.rgba[ii].r C,                              \
+           color->aN.rgba[ii].g C,                              \
+           color->aN.rgba[ii].b C,                              \
+           1.0);                                                \
+        SKC_RENDER_SURFACE_WRITE(surface,(int2)(x,y+I),rgba);   \
+      }
+
+#endif
+      // expands SKC_EXPAND_X as defined just above; presumably once per vector component -- confirm in the SKC_EXPAND_n header
+      SKC_RENDER_SCANLINE_VECTOR_EXPAND();
+
+      x += 1; // next ii writes at the next x coordinate
+    }
+#else // compiled-out path: x/y roles swapped, writes go to (x+I,y+ii)
+    int x = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE);
+    int y = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH;
+
+    // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+    for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+      {
+#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) {                       \
+        SKC_RENDER_SURFACE_WRITE(surface,               \
+                                 (int2)(x+I,y+ii),      \
+                                 color->iN.rgba[ii] A); \
+      }
+
+#else
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) {                               \
+      SKC_RENDER_SURFACE_COLOR const rgba =                     \
+        (SKC_RENDER_SURFACE_COLOR)                              \
+        (color->aN.rgba[ii].r C,                                \
+        color->aN.rgba[ii].g C,                                 \
+        color->aN.rgba[ii].b C,                                 \
+        1.0);                                                   \
+      SKC_RENDER_SURFACE_WRITE(surface,(int2)(x+I,y+ii),rgba);  \
+    }
+
+#endif
+
+      SKC_RENDER_SCANLINE_VECTOR_EXPAND();
+    }
+
+#endif
+}
+
+#endif
+
+//
+// Mask a ttck key index with SKC_RENDER_SUBGROUP_MASK (SKC_RENDER_SUBGROUP_SIZE - 1).
+static
+uint const
+skc_ttck_lane(uint const ttck_idx)
+{
+  uint const lane = ttck_idx & SKC_RENDER_SUBGROUP_MASK;
+  return lane;
+}
+
+//
+// RENDER KERNEL
+//
+
+__kernel
+SKC_RENDER_KERNEL_ATTRIBS
+void
+skc_kernel_render(__global   union  skc_layer_node   const * SKC_RESTRICT const layers,
+                  __global   struct skc_group_node   const * SKC_RESTRICT const groups,
+                  __global   union  skc_styling_cmd  const * SKC_RESTRICT const commands,     // FIXME -- rename
+
+                  __global   skc_ttck_t              const * SKC_RESTRICT const ttck_keys,    // rename: keys
+                  skc_uint                                                const ttck_count,   // rename: key_count
+
+                  __global   uint                    const * SKC_RESTRICT const ttck_offsets, // rename: offsets
+                  skc_uint                                                const tile_count,   // rename: offset_count
+
+                  __global   skc_ttxb_t              const * SKC_RESTRICT const ttxb_extent,
+#ifdef SKC_SURFACE_IS_BUFFER
+                  __global   void                          * SKC_RESTRICT const surface,
+#else
+                  __write_only image2d_t                                        surface,
+#endif
+#ifdef SKC_SURFACE_IS_BUFFER
+                  skc_uint                                                const surface_pitch,
+#endif
+                  uint4                                                   const tile_clip)    // rename: clip
+{
+  //
+  // Each subgroup is responsible for a tile.  No extra subgroups are
+  // launched.
+  //
+  // FIXME -- might be better implemented as a "grid stride loop" if
+  // Intel GEN really has a local memory "quantum" of 4KB which means
+  // we would need to launch 4 subgroups per workgroup.
+  //
+  // Confirmed: GEN8 has 4KB SLM workgroup min while GEN9 is 1KB.
+  //
+
+  //
+  // declare tile cover and color registers
+  //
+  // this used to be a neat unified struct but the Intel GEN compiler
+  // wasn't cooperating and spilling to private memory even though all
+  // registers were indexed by constants
+  //
+  union skc_tile_color  color_wip;
+  union skc_tile_color  color_acc;
+
+  union skc_tile_cover  cover_wip;
+  union skc_tile_cover  cover_acc;
+  union skc_tile_cover  cover_msk;
+
+  //
+  // which subgroup in the grid is this?
+  //
+  // TAKE NOTE: the Intel GEN compiler is recognizing get_group_id(0)
+  // as a uniform but the alternative calculation used when there are
+  // multiple subgroups per workgroup is not cooperating and
+  // driving spillage elsewhere.
+  //
+#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 )
+  skc_uint const ttck_offset_idx = get_group_id(0);
+#else
+  skc_uint const ttck_offset_idx = get_group_id(0) * SKC_RENDER_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
+
+  //
+  // load the starting ttck for this offset and get a bound on the max
+  // number of keys that might be loaded
+  //
+  // these are uniform across all subgroup lanes
+  //
+  skc_uint ttck_idx = ttck_offsets[ttck_offset_idx];
+
+  //
+  // FIXME -- SIMD/CPU version should probably load a 256-bit (4-wide)
+  // vector of ttck keys
+  //
+#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK
+
+  skc_ttck_t ttck = ttck_keys[ttck_idx];
+
+#else
+
+  uint const ttck_base = ttck_idx & ~SKC_RENDER_SUBGROUP_MASK;
+  uint const ttck_lane = ttck_idx &  SKC_RENDER_SUBGROUP_MASK;
+  skc_ttck_t ttck_s    = ttck_keys[min(ttck_base+max(get_sub_group_local_id(),ttck_lane),ttck_count-1)]
+
+#endif
+
+  //
+  // set up style group/layer state
+  //
+  struct skc_styling_group {
+    union skc_group_range range;
+    skc_uint              depth;
+    skc_uint              id;
+  } group;
+
+  group.range.lo = 0;
+  group.range.hi = SKC_UINT_MAX;
+  group.depth    = 0;
+  group.id       = SKC_UINT_MAX;
+
+  //
+  // start with clear tile opacity, knockout and flag bits
+  //
+  // uint color_acc_opacity  = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32
+  // uint cover_acc_knockout = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32
+  //
+  skc_uint flags = 0;
+
+  //
+  // declare and initialize accumulators
+  //
+#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 )
+  __local union skc_subgroup_smem                      smem[1];
+#else
+  __local union skc_subgroup_smem                      smem_wg[SKC_RENDER_WORKGROUP_SUBGROUPS];
+  __local union skc_subgroup_smem * SKC_RESTRICT const smem = smem_wg + get_sub_group_id();
+#endif
+
+#ifdef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK
+  //
+  // select the initial ttck key
+  //
+  skc_ttck_t ttck;
+#if 0
+  ttck    = sub_group_broadcast(ttck_s,ttck_lane);    // SHOULD WORK BUT .4454 COMPILER IS BROKEN
+#else
+  ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane); // EXPLICIT WORKAROUND
+  ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane);
+#endif
+
+#endif
+
+  //
+  // save the first key so we know what tile we're in
+  //
+  skc_ttck_t ttck0 = ttck;
+
+  //
+  // evaluate the coarse clip as late as possible
+  //
+  skc_uint const ttck_hi_x = skc_ttck_hi_get_x(ttck0.hi);
+
+  if ((ttck_hi_x < tile_clip.lo.x) || (ttck_hi_x >= tile_clip.hi.x))
+    return;
+
+  skc_uint const ttck_hi_y = skc_ttck_hi_get_y(ttck0.hi);
+
+  if ((ttck_hi_y < tile_clip.lo.y) || (ttck_hi_y >= tile_clip.hi.y))
+    return;
+
+#if 0
+  printf("< %u, %u >\n",ttck_hi_x,ttck_hi_y);
+#endif
+
+  //
+  // load -> scatter -> flush
+  //
+  while (true)
+    {
+      // if scattering is disabled then just run through ttck keys
+      bool const is_scatter_enabled = (flags & SKC_TILE_FLAGS_SCATTER_SKIP) == 0;
+
+      // need to clear accumulators before a scatter loop
+      if (is_scatter_enabled)
+        {
+          skc_tile_aa_zero(smem);
+        }
+
+      do {
+        // skip scattering?
+        if (is_scatter_enabled)
+          {
+            skc_block_id_t const xb_id = skc_ttck_lo_get_ttxb_id(ttck.lo);
+
+            if (skc_ttck_lo_is_prefix(ttck.lo)) {
+              skc_scatter_ttpb(ttxb_extent,smem,xb_id);
+            } else {
+              skc_scatter_ttsb(ttxb_extent,smem,xb_id);
+            }
+          }
+
+        //
+        // any ttck keys left?
+        //
+        if (++ttck_idx >= ttck_count)
+          {
+            flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE;
+            break;
+          }
+
+        //
+        // process next ttck key
+        //
+#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK
+        //
+        // SIMD -- read next key
+        //
+        ttck = ttck_keys[ttck_idx];
+#else
+        //
+        // SIMT -- refresh the ttck_s?
+        //
+        uint const ttck_lane_next = ttck_idx & SKC_RENDER_SUBGROUP_MASK;
+
+        if (ttck_lane_next == 0)
+          ttck_s = ttck_keys[min(ttck_idx+get_sub_group_local_id(),ttck_count-1)];
+
+        //
+        // broadcast next key to entire subgroup
+        //
+#if 0
+        ttck    = sub_group_broadcast(ttck_s,ttck_lane_next);    // SHOULD WORK BUT .4454 COMPILER IS BROKEN
+#else
+        ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane_next); // EXPLICIT WORKAROUND
+        ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane_next);
+#endif
+#endif
+        // continue scattering if on same YXL layer
+      } while (skc_ttck_equal_yxl(ttck0,ttck));
+
+      // finalize if no longer on same YX tile
+      if (!skc_ttck_hi_equal_yx(ttck0.hi,ttck.hi))
+        {
+          // otherwise, unwind the tile styling and exit
+          flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE;
+        }
+
+      //
+      // given: new layer id from ttxk key
+      //
+      // load [layer id]{ group id, depth }
+      //
+      // if within current group's layer range
+      //
+      //   if at same depth
+      //
+      //     load and execute cover>[mask>]color>blend commands
+      //
+      //   else if not at same depth then move deeper
+      //
+      //     for all groups in group trail from cur depth to new depth
+      //       enter group, saving and initializing regs as necessary
+      //     increment depth and update layer range
+      //     load and execute cover>[mask>]color>blend commands
+      //
+      // else not within layer range
+      //
+      //   exit current group, restoring regs as necessary
+      //   decrement depth and update layer range
+      //
+      //
+      skc_layer_id         const layer_id_new   = skc_ttck_get_layer(ttck0); // FIXME -- this was ttck_hi
+      union skc_layer_node const layer_node_new = layers[layer_id_new];
+
+      // clear flag that controls group/layer traversal
+      flags &= ~SKC_TILE_FLAGS_FLUSH_COMPLETE;
+
+      do {
+        bool const unwind = (flags & SKC_TILE_FLAGS_FLUSH_UNWIND) != 0;
+
+        //
+        // is layer a child of the current parent group?
+        //
+        uint cmd_next = 0;
+
+        if (!unwind && (layer_node_new.parent == group.id))
+          {
+            // execute this layer's cmds
+            cmd_next = layer_node_new.cmds;
+
+            // if this is final then configure so groups get unwound, otherwise we're done
+            flags   |= ((flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) ? SKC_TILE_FLAGS_FLUSH_UNWIND : SKC_TILE_FLAGS_FLUSH_COMPLETE);
+          }
+        else if (!unwind && (layer_id_new >= group.range.lo && layer_id_new <= group.range.hi))
+          {
+            //
+            // is layer in a child group?
+            //
+            union skc_group_parents const gp = groups[layer_node_new.parent].parents;
+            uint                    const gn = gp.depth - ++group.depth;
+
+            if (gn == 0)
+              group.id = layer_node_new.parent;
+            else
+              group.id = commands[gp.base + gn - 1].parent;
+
+            // update group layer range
+            group.range = groups[group.id].range;
+
+            // enter current group
+            cmd_next    = groups[group.id].cmds.enter;
+          }
+        else // otherwise, exit this group
+          {
+            // leave current group
+            cmd_next = groups[group.id].cmds.leave;
+
+            // decrement group depth
+            if (--group.depth == 0)
+              {
+                flags |= SKC_TILE_FLAGS_FLUSH_COMPLETE;
+              }
+            else
+              {
+                // get path_base of current group
+                uint const gnpb = groups[group.id].parents.base;
+
+                // get parent of current group
+                group.id    = commands[gnpb].parent;
+
+                // update group layer range
+                group.range = groups[group.id].range;
+              }
+          }
+
+        //
+        // execute cmds
+        //
+        while (true)
+          {
+            union skc_styling_cmd const cmd = commands[cmd_next++];
+
+            switch (cmd.u32 & SKC_STYLING_OPCODE_MASK_OPCODE)
+              {
+              case SKC_STYLING_OPCODE_NOOP:
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_NONZERO:
+                skc_tile_cover_nonzero(smem,&cover_wip,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_EVENODD:
+                skc_tile_cover_evenodd(smem,&cover_wip,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_ACCUMULATE:
+                skc_tile_cover_accumulate(&cover_acc,&cover_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_MASK:
+                skc_tile_cover_wip_mask(&cover_wip,&cover_msk);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_WIP_ZERO:
+                skc_tile_cover_wip_zero(&cover_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_ACC_ZERO:
+                skc_tile_cover_acc_zero(&cover_acc);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_MASK_ZERO:
+                skc_tile_cover_msk_zero(&cover_msk);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_MASK_ONE:
+                skc_tile_cover_msk_one(&cover_msk);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_MASK_INVERT:
+                skc_tile_cover_msk_invert(&cover_msk);
+                break;
+
+              case SKC_STYLING_OPCODE_COLOR_FILL_SOLID:
+                skc_tile_color_fill_solid(commands,&cmd_next,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COLOR_FILL_GRADIENT_LINEAR:
+                //
+                // FIXME -- gradients shouldn't be executing so much
+                // conditional driven code at runtime since we *know*
+                // the gradient style on the host can just create a
+                // new styling command to exploit this.
+                //
+                // FIXME -- it might be time to try using the GPU's
+                // sampler on a linear array of half4 vectors -- it
+                // might outperform the explicit load/lerp routines.
+                //
+                // FIXME -- optimizing for vertical gradients (uhhh,
+                // they're actually horizontal due to the -90 degree
+                // view transform) is nice but is it worthwhile to
+                // have this in the kernel?  Easy to add it back...
+                //
+#if defined( SKC_ARCH_GEN9 )
+                // disable gradients due to excessive spillage -- fix later
+                cmd_next += SKC_GRADIENT_CMD_WORDS_V1(commands[cmd_next+6].u32);
+#else
+                skc_tile_color_fill_gradient_linear_nonvertical(smem,commands,&cmd_next,&color_wip,ttck0.hi);
+#endif
+                break;
+
+              case SKC_STYLING_OPCODE_COLOR_WIP_ZERO:
+                skc_tile_color_wip_zero(&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COLOR_ACC_ZERO:
+                skc_tile_color_acc_zero(&color_acc);
+                break;
+
+              case SKC_STYLING_OPCODE_BLEND_OVER:
+                skc_tile_blend_over(&color_acc,&cover_wip,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_BLEND_PLUS:
+                skc_tile_blend_plus(&color_acc,&cover_wip,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_BLEND_MULTIPLY:
+                skc_tile_blend_multiply(&color_acc,&cover_wip,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_BLEND_KNOCKOUT:
+                skc_tile_blend_knockout(&cover_acc,&color_acc,&cover_wip,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_WIP_MOVE_TO_MASK:
+                // skc_tile_cover_msk_copy_wip(&cover_msk,&cover_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_ACC_MOVE_TO_MASK:
+                // skc_tile_cover_msk_copy_acc(&cover_msk,&cover_acc);
+                break;
+
+              case SKC_STYLING_OPCODE_BACKGROUND_OVER:
+                skc_tile_background_over(commands,&cmd_next,&color_acc);
+                break;
+
+              case SKC_STYLING_OPCODE_SURFACE_COMPOSITE:
+#ifdef SKC_SURFACE_IS_BUFFER
+                skc_surface_composite_u8_rgba(surface,surface_pitch,&color_acc,ttck0.hi);
+#else
+                skc_surface_composite_u8_rgba(surface,              &color_acc,ttck0.hi);
+#endif
+                break;
+
+              case SKC_STYLING_OPCODE_COLOR_ACC_TEST_OPACITY:
+                if (skc_tile_color_test_opacity(&color_acc))
+                  flags |= SKC_TILE_FLAGS_SCATTER_SKIP;
+                break;
+
+              default:
+                return; // this is an illegal opcode -- trap and die!
+              }
+
+            //
+            // if sign bit is set then this was final command
+            //
+            if (cmd.s32 < 0)
+              break;
+          }
+
+        // continue as long as tile flush isn't complete
+      } while ((flags & SKC_TILE_FLAGS_FLUSH_COMPLETE) == 0);
+
+      // return if was the final flush
+      if (flags & SKC_TILE_FLAGS_FLUSH_FINALIZE)
+        return;
+
+      // update wip ttck key (both lo and hi words)
+      ttck0 = ttck;
+    }
+}
+
+//
+//
+//