1 files changed, 3366 insertions, 3366 deletions
diff --git a/src/compute/skc/platforms/cl_12/kernels/rasterize.cl b/src/compute/skc/platforms/cl_12/kernels/rasterize.cl
index e622845d9c..f20f6456b3 100644
--- a/src/compute/skc/platforms/cl_12/kernels/rasterize.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/rasterize.cl
@@ -1,3366 +1,3366 @@
-/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "tile.h"
-#include "common.h"
-#include "atomic_cl.h"
-#include "block_pool_cl.h"
-#include "raster_builder_cl_12.h"
-#include "device_cl_12.h"
-
-// #define SKC_ARCH_AVX2
-// #define SKC_RASTERIZE_SIMD_USES_SMEM
-
-#define PRINTF_ENABLE       0
-#define PRINTF_BLOCK_COUNT  0
-
-//
-// NOTE:
-//
-// ON SIMD DEVICES THE BIN COUNT MUST BE POW2 SO THAT WE CAN LOAD IT
-// AS A VECTOR AND PERFORM A SWIZZLE/SHUFFLE
-//
-// NOTE:
-//
-// IGNORE FOR NOW ANY AVX2 CODE SNIPPETS.  THEY WILL BE MOVED ASAP.
-//
-//
-
-#if 0 // SKC_ARCH_AVX2
-
-// #define SKC_RASTERIZE_SUBGROUP_SIZE              1
-// #define SKC_RASTERIZE_VECTOR_SIZE_LOG2           3
-// #define SKC_RASTERIZE_WORKGROUP_COUNT_SUBGROUP   1
-
-// #define SKC_TTXB_WORDS                           8
-
-// #define SKC_RASTERIZE_FLOAT                      float8
-// #define SKC_RASTERIZE_UINT                       uint8
-// #define SKC_RASTERIZE_INT                        int8
-// #define SKC_RASTERIZE_PREDICATE                  int8
-
-// #define SKC_RASTERIZE_BIN_BLOCK                  uint16
-// #define SKC_RASTERIZE_BIN                        uint8
-
-// #define SKC_RASTERIZE_POOL                       uint8
-// #define SKC_RASTERIZE_POOL_SCALE                 6
-
-// #define SKC_RASTERIZE_TILE_HASH_X_BITS           1
-// #define SKC_RASTERIZE_TILE_HASH_Y_BITS           2
-
-// #define SKC_RASTERIZE_VECTOR_EXPAND()            SKC_EXPAND_8()
-
-#endif
-
-//
-// SIMT
-//
-
-#define SKC_RASTERIZE_BLOCK_ID_V_SIZE        SKC_RASTERIZE_SUBGROUP_SIZE
-#define SKC_RASTERIZE_TTSK_V_SIZE            SKC_RASTERIZE_SUBGROUP_SIZE
-#define SKC_RASTERIZE_TTSK_V_MASK            (SKC_RASTERIZE_TTSK_V_SIZE - 1)
-
-//
-//
-//
-
-#define SKC_RASTERIZE_VECTOR_SIZE            (1 << SKC_RASTERIZE_VECTOR_SIZE_LOG2)
-#define SKC_RASTERIZE_ELEMS_PER_SUBGROUP     (SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_VECTOR_SIZE)
-
-//
-//
-//
-
-#define SKC_RASTERIZE_YX_INIT                0x7FFF7FFF  // { +32767, +32767 }
-#define SKC_RASTERIZE_YX_INVALID             0x80008000  // { -32768, -32768 }
-
-//
-//
-//
-
-#define SKC_RASTERIZE_TILE_HASH_X_MASK       SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_X_BITS)
-#define SKC_RASTERIZE_TILE_HASH_Y_MASK       SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_Y_BITS)
-#define SKC_RASTERIZE_TILE_HASH_BITS         (SKC_RASTERIZE_TILE_HASH_X_BITS + SKC_RASTERIZE_TILE_HASH_Y_BITS)
-#define SKC_RASTERIZE_TILE_HASH_BIN_COUNT    (1 << SKC_RASTERIZE_TILE_HASH_BITS)
-#define SKC_RASTERIZE_TILE_HASH_BIN_BITS     (SKC_RASTERIZE_TILE_HASH_BITS + 1) // FIXME -- LOG2_RU(BIN_COUNT)
-#define SKC_RASTERIZE_TILE_HASH_BIN_MASK     SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_BIN_BITS)
-
-//
-// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"
-//
-// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/
-//
-// Lerp in two fma/mad ops:
-//
-//    t * b + ((-t) * a + a)
-//
-// Note: OpenCL documents mix() as being implemented as:
-//
-//    a + (b - a) * t
-//
-// But this may be a native instruction on some devices. For example,
-// on GEN9 there is an LRP "linear interoplation" opcode but it
-// doesn't appear to support half floats.
-//
-// Feel free to toggle this option and then benchmark and inspect the
-// generated code.  We really want the double FMA to be generated when
-// there isn't support for a LERP/MIX operation.
-//
-
-#if 1
-#define SKC_LERP(a,b,t)      mad(t,b,mad(-(t),a,a))
-#else
-#define SKC_LERP(a,b,t)      mix(a,b,t)
-#endif
-
-//
-// There is no integer MAD in OpenCL with "don't care" overflow
-// semantics.
-//
-// FIXME -- verify if the platform needs explicit MAD operations even
-// if a "--fastmath" option is available at compile time.  It might
-// make sense to explicitly use MAD calls if the platform requires it.
-//
-
-#if 1
-#define SKC_MAD_UINT(a,b,c)  ((a) * (b) + (c))
-#else
-#define SKC_MAD_UINT(a,b,c)  mad_sat(a,b,c)
-#endif
-
-//
-//
-//
-
-#define SKC_RASTERIZE_SEGMENT(id) (id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane())
-
-//
-//
-//
-
-union skc_bp_elem
-{
-  skc_uint              u32;
-  skc_tagged_block_id_t tag_id;
-  skc_float             coord;
-};
-
-//
-//
-//
-
-struct skc_subgroup_smem
-{
-  //
-  // SIMT subgroup scratchpad for max scan -- also shared with 'winner' member
-  //
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) || defined ( SKC_RASTERIZE_SIMD_USES_SMEM )
-  struct {
-    union {
-
-      skc_uint                winner;
-
-      struct {
-        skc_uint              scratch[SKC_RASTERIZE_SUBGROUP_SIZE];
-      } aN;
-
-      struct {
-        SKC_RASTERIZE_UINT    scratch[SKC_RASTERIZE_SUBGROUP_SIZE];
-      } vN;
-    };
-  } subgroup;
-#endif
-
-  //
-  // work-in-progress TTSB blocks and associated YX keys
-  //
-  union {
-    struct {
-      // FIXME -- some typedefs are valid here
-      skc_uint                ttsb [SKC_RASTERIZE_TILE_HASH_BIN_COUNT][SKC_DEVICE_SUBBLOCK_WORDS];
-      skc_uint                yx   [SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
-      skc_uint                id   [SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
-      skc_uint                count[SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
-    } aN;
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-    struct {
-      SKC_RASTERIZE_BIN_BLOCK ttsb[SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
-      SKC_RASTERIZE_BIN       yx;
-      SKC_RASTERIZE_BIN       id;
-      SKC_RASTERIZE_BIN       count;
-    } vN;
-#endif
-  } bin;
-};
-
-//
-//
-//
-
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-#define skc_subgroup_lane()  0
-#else
-#define skc_subgroup_lane()  get_sub_group_local_id()
-#endif
-
-//
-// replenish block ids
-//
-// note that you can't overrun the block id pool since it's a ring
-//
-
-static
-void
-skc_blocks_replenish(skc_uint                           * const blocks_next,
-                     skc_block_id_v_t                   * const blocks,
-                     __global SKC_ATOMIC_UINT  volatile * const bp_atomics,
-                     skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring
-                     __global skc_block_id_t   const    * const bp_ids)
-{
-  //
-  // get a new vector of block ids -- this is kind of a narrow
-  // allocation but subblocks help stretch out the pool.
-  //
-  // FIXME -- there is now plenty of SMEM to allocate a LOT of block ids
-  //
-  skc_uint bp_idx = 0;
-
-  if (skc_subgroup_lane() == 0)
-    {
-      bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,
-                                                    SKC_RASTERIZE_BLOCK_ID_V_SIZE); // ring_reads
-#if 0
-      printf("r+: %8u + %u\n",bp_idx,SKC_RASTERIZE_BLOCK_ID_V_SIZE);
-#endif
-    }
-
-  bp_idx       = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane()) & bp_mask;
-  *blocks      = bp_ids[bp_idx];
-  *blocks_next = 0;
-}
-
-//
-//
-//
-
-static
-skc_block_id_t
-skc_blocks_get_next(skc_uint                           * const blocks_next,
-                    skc_block_id_v_t                   * const blocks,
-                    __global SKC_ATOMIC_UINT  volatile * const bp_atomics,
-                    skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring
-                    __global skc_block_id_t   const    * const bp_ids)
-{
-  // replenish?
-  if (*blocks_next == SKC_RASTERIZE_BLOCK_ID_V_SIZE)
-    {
-      skc_blocks_replenish(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);
-    }
-
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 )
-  //
-  // SIMT
-  //
-  skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next);
-
-#else
-  //
-  // SIMD
-  //
-  skc_block_id_t id = blocks->s0;
-
-  skc_shuffle_down_1(*blocks);
-
-#endif
-
-  *blocks_next += 1;
-
-  return id;
-}
-
-//
-// subblock allocator
-//
-
-#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
-
-static
-skc_block_id_t
-skc_subblocks_get_next(skc_block_id_t                     * const subblocks,
-                       skc_uint                           * const blocks_next,
-                       skc_block_id_v_t                   * const blocks,
-                       __global SKC_ATOMIC_UINT  volatile * const bp_atomics,
-                       skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring
-                       __global skc_block_id_t   const    * const bp_ids)
-{
-  if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
-    {
-      *subblocks = skc_blocks_get_next(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);
-    }
-
-  skc_block_id_t const sb_id = *subblocks;
-
-  *subblocks += 1;
-
-#if 0
-  if (get_sub_group_local_id() == 0)
-    printf("= %u\n",sb_id);
-#endif
-
-  return sb_id;
-}
-
-
-#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const subblocks, skc_block_id_t * const blocks
-#define SKC_SUBBLOCKS_BLOCKS_ARGS()  subblocks, blocks
-
-#else
-
-#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const blocks
-#define SKC_SUBBLOCKS_BLOCKS_ARGS()  blocks
-
-#endif
-
-//
-//
-//
-
-static
-skc_block_id_t
-skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_PROTO(),
-                  skc_uint                           * const blocks_next,
-                  __global SKC_ATOMIC_UINT  volatile * const bp_atomics,
-                  skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring
-                  __global skc_block_id_t   const    * const bp_ids,
-                  __global SKC_ATOMIC_UINT  volatile * const cohort_atomics,
-                  skc_ttsk_v_t                       * const sk_v,
-                  skc_uint                           * const sk_v_next,
-                  __global skc_ttsk_s_t              * const sk_extent,
-                  skc_uint                             const new_yx)
-{
-#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
-  skc_block_id_t const new_id = skc_subblocks_get_next(subblocks,
-                                                       blocks_next,
-                                                       blocks,
-                                                       bp_atomics,
-                                                       bp_mask,
-                                                       bp_ids);
-#else
-  skc_block_id_t const new_id = skc_blocks_get_next(blocks_next,
-                                                    blocks,
-                                                    bp_atomics,
-                                                    bp_mask, // pow2 modulo mask for block pool ring
-                                                    bp_ids);
-#endif
-
-  if (get_sub_group_local_id() == (*sk_v_next & SKC_RASTERIZE_TTSK_V_MASK))
-    {
-      sk_v->lo = new_id;
-      sk_v->hi = (sk_v->hi & SKC_TTRK_HI_MASK_COHORT) | new_yx;
-#if 0
-      printf("@ ( %3u, %3u ) %u\n",
-             (new_yx >> 12) & 0xFFF,
-             (new_yx      ) & 0xFFF,
-             new_id);
-#endif
-    }
-
-  *sk_v_next += 1;
-
-  if (*sk_v_next == SKC_RASTERIZE_TTSK_V_SIZE)
-    {
-      *sk_v_next = 0;
-
-      skc_uint sk_idx = 0;
-
-      if (skc_subgroup_lane() == 0)
-        {
-          sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
-            (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_TTSK_V_SIZE);
-#if 0
-          printf("+ %u\n",sk_idx);
-#endif
-        }
-
-      sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();
-
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE > SKC_RASTERIZE_TTSK_V_SIZE )
-      if (skc_subgroup_lane() < SKC_RASTERIZE_TTSK_V_SIZE)
-#endif
-        {
-          sk_extent[sk_idx] = *sk_v;
-#if 0
-          printf("> %u : %v2u\n",sk_idx,*sk_v);
-#endif
-        }
-    }
-
-  return new_id;
-}
-
-//
-//
-//
-
-static
-SKC_RASTERIZE_FLOAT
-skc_subgroup_scan_inclusive_add_float(SKC_RASTERIZE_FLOAT const v)
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-  //
-  // SIMD
-  //
-  // Note that there isn't a built-in horizontal scan for vectors so
-  // we'll define some here for various widths.
-  //
-  // FIXME -- a scalar version might be faster so put in a
-  // compile-time switch to selection between implementations
-  //
-
-#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
-  return v;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
-  // 01
-  //  0 +
-  // --
-  // 01
-  SKC_RASTERIZE_FLOAT const w = mad(v.s10,(SKC_RASTERIZE_FLOAT)(0,1),v);
-  return w;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
-  // 0123
-  //  012 +
-  // ----
-  // 0123
-  //   01 +
-  // ----
-  // 0123
-  //
-  SKC_RASTERIZE_FLOAT const w = mad(v.s3012,(SKC_RASTERIZE_FLOAT)(0,1,1,1),v);
-  SKC_RASTERIZE_FLOAT const x = mad(w.s2301,(SKC_RASTERIZE_FLOAT)(0,0,1,1),w);
-  return x;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
-  // 01234567
-  //  0123456 +
-  // --------
-  // 01234567
-  //   012345 +
-  // --------
-  // 01234567
-  //     0123 +
-  // --------
-  // 01234567
-  //
-  SKC_RASTERIZE_FLOAT const w = mad(v.s70123456,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1),v);
-  SKC_RASTERIZE_FLOAT const x = mad(w.s67012345,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1),w);
-  SKC_RASTERIZE_FLOAT const y = mad(x.s45670123,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1),x);
-  return y;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
-  // 0123456789abcdef
-  //  0123456789abcde +
-  // ----------------
-  // 0123456789abcdef
-  //   0123456789abcd +
-  // ----------------
-  // 0123456789abcdef
-  //     0123456789ab +
-  // ----------------
-  // 0123456789abcdef
-  //         01234567 +
-  // ----------------
-  // 0123456789abcdef
-  //
-  SKC_RASTERIZE_FLOAT const w = mad(v.sf0123456789abcde,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);
-  SKC_RASTERIZE_FLOAT const x = mad(w.sef0123456789abcd,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);
-  SKC_RASTERIZE_FLOAT const y = mad(x.scdef0123456789ab,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);
-  SKC_RASTERIZE_FLOAT const z = mad(y.s89abcdef01234567,(SKC_RASTERIZE_FLOAT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);
-  return z;
-
-#endif
-
-#else
-  //
-  // SIMT
-  //
-
-  return sub_group_scan_inclusive_add(v);
-
-#endif
-}
-
-//
-//
-//
-
-static
-SKC_RASTERIZE_UINT
-skc_subgroup_scan_inclusive_add_uint(SKC_RASTERIZE_UINT const v)
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-  //
-  // SIMD
-  //
-  // Note that there isn't a built-in horizontal scan for vectors so
-  // we'll define some here for various widths.
-  //
-  // FIXME -- a scalar version might be faster so put in a
-  // compile-time switch to selection between implementations
-  //
-
-#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
-  return v;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
-  // 01
-  //  0 +
-  // --
-  // 01
-  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s10,(SKC_RASTERIZE_UINT)(0,1),v);
-  return w;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
-  // 0123
-  //  012 +
-  // ----
-  // 0123
-  //   01 +
-  // ----
-  // 0123
-  //
-  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s3012,(SKC_RASTERIZE_UINT)(0,1,1,1),v);
-  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s2301,(SKC_RASTERIZE_UINT)(0,0,1,1),w);
-  return x;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
-  // 01234567
-  //  0123456 +
-  // --------
-  // 01234567
-  //   012345 +
-  // --------
-  // 01234567
-  //     0123 +
-  // --------
-  // 01234567
-  //
-  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s70123456,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1),v);
-  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s67012345,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1),w);
-  SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.s45670123,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1),x);
-  return y;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
-  // 0123456789abcdef
-  //  0123456789abcde +
-  // ----------------
-  // 0123456789abcdef
-  //   0123456789abcd +
-  // ----------------
-  // 0123456789abcdef
-  //     0123456789ab +
-  // ----------------
-  // 0123456789abcdef
-  //         01234567 +
-  // ----------------
-  // 0123456789abcdef
-  //
-  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.sf0123456789abcde,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);
-  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.sef0123456789abcd,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);
-  SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.scdef0123456789ab,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);
-  SKC_RASTERIZE_UINT const z = SKC_MAD_UINT(y.s89abcdef01234567,(SKC_RASTERIZE_UINT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);
-  return z;
-
-#endif
-
-#else
-  //
-  // SIMT
-  //
-
-  return sub_group_scan_inclusive_add(v);
-
-#endif
-}
-
-//
-//
-//
-
-static
-SKC_RASTERIZE_UINT
-skc_subgroup_scan_inclusive_max(SKC_RASTERIZE_UINT const v)
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-  //
-  // SIMD
-  //
-  // Note that there isn't a built-in horizontal scan for vectors so
-  // we'll define some here for various widths.
-  //
-  // FIXME -- a scalar version might be faster so put in a
-  // compile-time switch to selection between implementations
-  //
-
-#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
-  return v;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
-  // 01
-  // 00 max
-  // --
-  // 01
-  SKC_RASTERIZE_UINT const w = max(v.s00,v);
-  return w;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
-  // 0123
-  // 0012 +
-  // ----
-  // 0123
-  // 0101 +
-  // ----
-  // 0123
-  //
-  SKC_RASTERIZE_UINT const w = max(v.s0012,v);
-  SKC_RASTERIZE_UINT const x = max(w.s0101,w);
-  return x;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
-  // 01234567
-  // 00123456 +
-  // --------
-  // 01234567
-  // 01012345 +
-  // --------
-  // 01234567
-  // 01230123 +
-  // --------
-  // 01234567
-  //
-  SKC_RASTERIZE_UINT const w = max(v.s00123456,v);
-  SKC_RASTERIZE_UINT const x = max(w.s01012345,w);
-  SKC_RASTERIZE_UINT const y = max(x.s01230123,x);
-  return y;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
-  // 0123456789abcdef
-  // 00123456789abcde +
-  // ----------------
-  // 0123456789abcdef
-  // 010123456789abcd +
-  // ----------------
-  // 0123456789abcdef
-  // 01230123456789ab +
-  // ----------------
-  // 0123456789abcdef
-  // 0123456701234567 +
-  // ----------------
-  // 0123456789abcdef
-  //
-  SKC_RASTERIZE_UINT const w = max(v.s00123456789abcde,v);
-  SKC_RASTERIZE_UINT const x = max(w.s010123456789abcd,w);
-  SKC_RASTERIZE_UINT const y = max(x.s01230123456789ab,x);
-  SKC_RASTERIZE_UINT const z = max(y.s0123456701234567,y);
-  return z;
-
-#endif
-
-#else
-  //
-  // SIMT
-  //
-
-  return sub_group_scan_inclusive_max(v);
-
-#endif
-}
-
-//
-//
-//
-
-static
-float
-skc_subgroup_last_float(SKC_RASTERIZE_FLOAT const v)
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-  //
-  // SIMD
-  //
-#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
-  return v;
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
-  return v.s1;
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
-  return v.s3;
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
-  return v.s7;
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
-  return v.sf;
-#endif
-
-#else
-  //
-  // SIMT
-  //
-  return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);
-
-#endif
-}
-
-//
-//
-//
-
-static
-SKC_RASTERIZE_UINT
-skc_subgroup_last_uint(SKC_RASTERIZE_UINT const v)
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-  //
-  // SIMD
-  //
-#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
-  return v;
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
-  return v.s1;
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
-  return v.s3;
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
-  return v.s7;
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
-  return v.sf;
-#endif
-
-#else
-  //
-  // SIMT
-  //
-  return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);
-
-#endif
-}
-
-//
-//
-//
-
-static
-float
-skc_subgroup_first(SKC_RASTERIZE_FLOAT const v)
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-  //
-  // SIMD
-  //
-#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
-  return v;
-#else
-  return v.s0;
-#endif
-
-#else
-  //
-  // SIMT
-  //
-  return sub_group_broadcast(v,0);
-
-#endif
-}
-
-//
-//
-//
-
-static
-SKC_RASTERIZE_FLOAT
-skc_subgroup_shuffle(SKC_RASTERIZE_FLOAT const v,
-                      SKC_RASTERIZE_UINT  const i)
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-  //
-  // SIMD
-  //
-#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
-  return v;
-#else
-  return shuffle(v,i);
-#endif
-
-#else
-  //
-  // SIMT
-  //
-  return intel_sub_group_shuffle(v,i);
-
-#endif
-}
-
-//
-//
-//
-
-static
-SKC_RASTERIZE_FLOAT
-skc_subgroup_shuffle_up_1(SKC_RASTERIZE_FLOAT const p, // previous
-                          SKC_RASTERIZE_FLOAT const c) // current
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-  //
-  // SIMD
-  //
-  // FIXME -- there are alternative formulations here:
-  //
-  // Option 1:
-  //
-  //   select(c.rotate(+1),p.rotate(-1),(1,0,0,...))
-  //
-  // Option 2:
-  //
-  //   p is a scalar
-  //   t    = c.rotate(+1)
-  //   t.s0 = p;
-  //
-  // Option 3: ...
-  //
-#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
-  return p;
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
-  return shuffle2(p,c,(uint2)(1,2));
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
-  return shuffle2(p,c,(uint4)(3,4,5,6));
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
-  return shuffle2(p,c,(uint8)(7,8,9,10,11,12,13,14));
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
-  return shuffle2(p,c,(uint16)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30));
-#endif
-
-#else
-  //
-  // SIMT
-  //
-  return intel_sub_group_shuffle_up(p,c,1);
-
-#endif
-}
-
-//
-//
-//
-
-static
-bool
-skc_is_lane_first()
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1)
-  //
-  // SIMD
-  //
-  return true;
-#else
-  //
-  // SIMT
-  //
-  return get_sub_group_local_id() == 0;
-#endif
-}
-
-//
-//
-//
-
-static
-SKC_RASTERIZE_FLOAT
-skc_delta_offset()
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-  //
-  // SIMD
-  //
-#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
-  return 1;
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
-  return (SKC_RASTERIZE_FLOAT)( 1, 2 );
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
-  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4 );
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
-  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8 );
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
-  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 );
-#endif
-
-#else
-  //
-  // SIMT
-  //
-  return 1.0f + get_sub_group_local_id();
-
-#endif
-
-}
-
-//
-//
-//
-
-static
-int
-skc_subgroup_any(SKC_RASTERIZE_PREDICATE const p)
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-  //
-  // SIMD
-  //
-  return any(p);
-#else
-  //
-  // SIMT
-  //
-  return sub_group_any(p);
-#endif
-}
-
-//
-//
-//
-
-#define SKC_PATH_NODEWORD_IS_LAST(n)  (((n) & SKC_DEVICE_BLOCK_WORDS_MASK) == SKC_DEVICE_BLOCK_WORDS_MASK)
-
-void
-skc_segment_next(__global union skc_bp_elem * const bp_elems,
-                 skc_uint                   * const nodeword,
-                 skc_block_id_t             * const id)
-{
-  if ((++*id & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
-    {
-      if (SKC_PATH_NODEWORD_IS_LAST(++*nodeword))
-        {
-          *nodeword = SKC_TAGGED_BLOCK_ID_GET_ID(bp_elems[*nodeword].tag_id) * SKC_DEVICE_SUBBLOCK_WORDS;
-        }
-
-      skc_tagged_block_id_t const tag_id = bp_elems[*nodeword].tag_id;
-
-      *id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
-    }
-}
-
-//
-//
-//
-
-static
-SKC_RASTERIZE_FLOAT
-skc_native_length(SKC_RASTERIZE_FLOAT const x, SKC_RASTERIZE_FLOAT const y)
-{
-  return native_sqrt(x * x + y * y);
-}
-
-//
-// Wang's Formula (1985)
-//
-
-#define SKC_WANG_PIXEL_RESL   0.25f // <-- this can be tuned
-
-#define SKC_WANG_EPSILON      (SKC_WANG_PIXEL_RESL * SKC_SUBPIXEL_RESL_X_F32)
-
-#define SKC_WANG_CUBIC        ((3.0f * 2.0f) / (8.0f * SKC_WANG_EPSILON))
-#define SKC_WANG_QUADRATIC    ((2.0f       ) / (8.0f * SKC_WANG_EPSILON))
-
-#define SKC_WANG_LENGTH(x,y)  skc_native_length(x,y)
-#define SKC_WANG_SQRT(x)      native_sqrt(x)
-
-//
-//
-//
-
-static
-SKC_RASTERIZE_FLOAT
-skc_wangs_formula_cubic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,
-                        SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,
-                        SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y,
-                        SKC_RASTERIZE_FLOAT const t3x, SKC_RASTERIZE_FLOAT const t3y)
-{
-  //
-  // Return the number of evenly spaced (in the parametric sense) line
-  // segments that are guaranteed to be within "epsilon" error of the
-  // curve.
-  //
-  // We're then going to take multiples of the reciprocal of this
-  // number so that the segmentation can be distributed across the
-  // subgroup.
-  //
-  // Note, this can probably be slightly optimized per architecture
-  // but it's probably far from being a hotspot since it's all
-  // straight-line unpredicated code.
-  //
-  // The result is an integer ranging from [1.0,#segments]
-  //
-  // Note that even if all of the control points are coincident, the
-  // max(1.0f) will categorize this as a line of 1 segment.
-  //
-  // This is what we want!  We want to convert cubics to lines as
-  // easily as possible and *then* cull lines that are either
-  // horizontal or zero length.
-  //
-  return max(1.0f,
-             ceil(SKC_WANG_SQRT(SKC_WANG_CUBIC *
-                                SKC_WANG_LENGTH(max(fabs(t2x - 2.0f * t1x + t0x),
-                                                    fabs(t3x - 2.0f * t2x + t1x)),
-                                                max(fabs(t2y - 2.0f * t1y + t0y),
-                                                    fabs(t3y - 2.0f * t2y + t1y))))));
-}
-
-static
-SKC_RASTERIZE_FLOAT
-skc_wangs_formula_quadratic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,
-                            SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,
-                            SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y)
-{
-  return max(1.0f,
-             ceil(SKC_WANG_SQRT(SKC_WANG_QUADRATIC *
-                                SKC_WANG_LENGTH(fabs(t2x - 2.0f * t1x + t0x),
-                                                fabs(t2y - 2.0f * t1y + t0y)))));
-}
-
-//
-// rational curves
-//
-
-static
-SKC_RASTERIZE_FLOAT
-skc_wangs_formula_cubic_rat()
-{
-  return 0.0f;
-}
-
-static
-SKC_RASTERIZE_FLOAT
-skc_wangs_formula_quad_rat()
-{
-  return 0.0f;
-}
-
-//
-// flush any work-in-progress blocks and return unused block ids
-//
-
-static
-void
-skc_finalize(__global SKC_ATOMIC_UINT          volatile * const bp_atomics,
-             __global union skc_bp_elem                 * const bp_elems,
-             __global uint                              * const bp_ids,
-             skc_uint                                     const bp_mask,
-             __global SKC_ATOMIC_UINT          volatile * const cohort_atomics,
-             skc_block_id_v_t                           * const blocks,
-             skc_uint                                     const blocks_next,
-             skc_ttsk_v_t                               * const sk_v,
-             skc_uint                                     const sk_v_next,
-             __global skc_ttsk_s_t                      * const sk_extent,
-             __local  struct skc_subgroup_smem volatile * const smem)
-{
-  //
-  // flush non-empty bins
-  //
-  // FIXME -- accelerate this iteration/search with a subgroup operation
-  //
-  for (skc_uint ii=0; ii<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; ii++)
-    {
-      if (smem->bin.aN.count[ii] > 0)
-        {
-          skc_block_id_v_t const id  = smem->bin.aN.id[ii];
-          skc_uint         const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
-          skc_uint         const tts = smem->bin.aN.ttsb[ii][skc_subgroup_lane()];
-#if 0
-          printf("???????? : [ %10u = %10u : %08X ]\n",id,idx,tts);
-#endif
-          bp_elems[idx].u32 = tts;
-        }
-
-      //
-      // FIXME -- vectorize with vstoreN()
-      //
-    }
-
-  //
-  // return remaining block ids back to the pool
-  //
-  skc_uint const blocks_rem = SKC_RASTERIZE_BLOCK_ID_V_SIZE - blocks_next;
-
-  if (blocks_rem > 0)
-    {
-      skc_uint bp_idx = 0;
-
-      if (skc_subgroup_lane() == 0)
-        {
-          bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,blocks_rem);
-
-#if 0
-          printf("r-: %8u + %u\n",bp_idx,blocks_rem);
-#endif
-        }
-
-      bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane() - blocks_next) & bp_mask;
-
-      if (skc_subgroup_lane() >= blocks_next)
-        {
-          bp_ids[bp_idx] = *blocks;
-        }
-    }
-
-  //
-  // flush work-in-progress ryx keys
-  //
-  if (sk_v_next > 0)
-    {
-      skc_uint sk_idx = 0;
-
-      if (skc_subgroup_lane() == 0)
-        {
-          sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
-            (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,sk_v_next);
-#if 0
-          printf("* %u\n",sk_idx);
-#endif
-        }
-
-      sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();
-
-      if (skc_subgroup_lane() < sk_v_next)
-        {
-          sk_extent[sk_idx] = *sk_v;
-        }
-    }
-}
-
-//
-// If there are lanes that were unable to append to a bin because
-// their hashes collided with a bin's current ryx key then those bins
-// must be ejected.
-//
-// Note that we do not eject "full" bins because lazily waiting for a
-// collision results in simpler code.
-//
-
-static
-void
-skc_flush(__global SKC_ATOMIC_UINT          volatile * const bp_atomics,
-          __global union skc_bp_elem                 * const bp_elems,
-          __global uint                              * const bp_ids,
-          skc_uint                                     const bp_mask,
-          __global SKC_ATOMIC_UINT          volatile * const cohort_atomics,
-          skc_block_id_t                             * const subblocks,
-          skc_block_id_v_t                           * const blocks,
-          skc_uint                                   * const blocks_next,
-          skc_ttsk_v_t                               * const sk_v,
-          skc_uint                                   * const sk_v_next,
-          __global skc_ttsk_s_t                      * const sk_extent,
-          __local  struct skc_subgroup_smem volatile * const smem,
-          SKC_RASTERIZE_UINT                           const hash,
-          SKC_RASTERIZE_UINT                           const yx,
-          SKC_RASTERIZE_PREDICATE                            is_collision) // pass by value
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-  //
-  // SIMD
-  //
-
-  //
-  // FIXME -- this code is now stale with the changes to the
-  // subblock/block allocation strategy
-  //
-
-  //
-  // get local TTSB ID queue count
-  //
-  skc_uint ttsb_id_count  = smem->pool.count; // scalar
-
-  // init hash bit mask
-  skc_uint component_mask = 0;
-
-  for (int cc=0; cc<SKC_RASTERIZE_VECTOR_SIZE; cc++)
-    {
-      // if no collision continue
-      if (((int*)&is_collision)[cc] == 0)
-        continue;
-
-      uint const winner        = ((uint*)&hash)[cc];
-      uint const component_bit = 1u << winner;
-
-      // if already processed this hash then continue
-      if (component_mask & component_bit)
-        continue;
-
-      // update component mask
-      component_mask |= component_bit;
-
-      //
-      // new winner requires ejecting the old TTSB
-      //
-      if (smem->bin.aN.count[winner] > 0)
-        {
-          skc_uint const elem_idx = smem->bin.aN.id[winner] * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
-
-          bp_elems[elem_idx].u32 = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];
-        }
-
-        //
-        // ensure there is at least one TTSK and TTSB ID
-        //
-        if (ttsb_id_count == SKC_RASTERIZE_POOL_SIZE)
-          {
-            //
-            // update remaining count
-            //
-            ttsb_id_count = 0;
-
-            //
-            // flush accumulated ttsk_ryx keys
-            //
-            uint const idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
-              (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_POOL_SIZE); // ttsk_ryx_count
-
-#if 0
-            printf("# %u\n",idx);
-#endif
-
-            for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE)
-              {
-                ttsk_ryx[idx + ii] = skc_make_ttsk_ryx(smem,SKC_CMD_RASTERIZE_GET_COHORT(cmd),ii);
-              }
-
-            //
-            // allocate more ttsb ids from pool
-            //
-            uint const id = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+0,SKC_RASTERIZE_POOL_SIZE); // ring_reads
-
-            for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE)
-              smem->pool.aN.id[ii] = bp_ids[id + ii];
-          }
-
-      //
-      // invalidate the winning block
-      //
-
-      //
-      // update bin with winning yx, new ttsb id and zero count
-      //
-      // all lanes are loading/storing from/to the same index
-      //
-      smem->bin.vN.ttsb [winner] = ( SKC_TTS_INVALID );
-      smem->bin.aN.id   [winner] = smem->pool.aN.id[ttsb_id_count];
-      smem->bin.aN.yx   [winner] = smem->pool.aN.yx[ttsb_id_count] = ((uint*)&yx)[cc];
-      smem->bin.aN.count[winner] = 0;
-
-      //
-      // update count
-      //
-      ttsb_id_count += 1;
-    }
-
-  //
-  // save count
-  //
-  smem->pool.count = ttsb_id_count;
-
-#else
-  //
-  // SIMT
-  //
-
-  do {
-    //
-    // only one lane will win!
-    //
-    if (is_collision)
-      smem->subgroup.winner = hash;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    //
-    // which bin is being ejected?
-    //
-    skc_uint const winner = smem->subgroup.winner;
-
-    //
-    // which colliding hash is taking over the bin?
-    //
-    SKC_RASTERIZE_PREDICATE const is_winner = is_collision && (hash == winner);
-
-    //
-    // all lanes with the same hash will try to store but only one
-    // lane will win
-    //
-    if (is_winner)
-      smem->subgroup.winner = yx;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    //
-    // flush this block to the pool
-    //
-    if (smem->bin.aN.count[winner] > 0)
-      {
-        skc_block_id_v_t const id  = smem->bin.aN.id[winner];
-        skc_uint         const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
-        skc_uint         const tts = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];
-#if 0
-        printf("%08X : [ %10u = %10u : %08X ]\n",yx,id,idx,tts);
-#endif
-        bp_elems[idx].u32 = tts;
-      }
-
-    //
-    // append new ttsk
-    //
-    skc_uint       const new_yx = smem->subgroup.winner;
-    skc_block_id_t const new_id = skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_ARGS(),
-                                                    blocks_next,
-                                                    bp_atomics,
-                                                    bp_mask, // pow2 modulo mask for block pool ring
-                                                    bp_ids,
-                                                    cohort_atomics,
-                                                    sk_v,
-                                                    sk_v_next,
-                                                    sk_extent,
-                                                    new_yx);
-
-#if 0
-    if (get_sub_group_local_id() == 0) {
-      printf(">>> %9u\n",new_id);
-    }
-#endif
-
-    //
-    // update bin with winning yx, new ttsb id and zero count
-    //
-    smem->bin.aN.ttsb [winner][skc_subgroup_lane()] = SKC_TTS_INVALID;
-    smem->bin.aN.yx   [winner]                      = new_yx;
-    smem->bin.aN.id   [winner]                      = new_id;
-    smem->bin.aN.count[winner]                      = 0;
-
-    //
-    // remove all lanes matching this hash
-    //
-    is_collision = is_collision && !is_winner;
-
-    //
-    // exit if nothing left to do
-    //
-  } while (sub_group_any(is_collision));
-
-#endif
-}
-
-//
-// scatter scan max
-//
-static
-SKC_RASTERIZE_UINT
-skc_scatter_scan_max(__local struct skc_subgroup_smem volatile * const smem,
-                     SKC_RASTERIZE_FLOAT                         const iss,
-                     SKC_RASTERIZE_FLOAT                         const ess)
-{
-  //
-  // prefix sums determine which lanes we're going to work on next
-  //
-  SKC_RASTERIZE_PREDICATE const is_scratch_store = (iss > 0.0f) && (ess < (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP);
-  SKC_RASTERIZE_UINT      const scratch_idx      = SKC_CONVERT(SKC_RASTERIZE_UINT)(max(ess,0.0f));
-
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-  //
-  // SIMD
-  //
-#ifdef SKC_RASTERIZE_SIMD_USES_SMEM
-  //
-  // SIMD APPROACH 1: SIMT'ISH
-  //
-
-  // zero the volatile smem scratchpad using vector syntax
-  smem->subgroup.vN.scratch[0] = ( 0 );
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A)                         \
-  if (is_scratch_store C)                               \
-    smem->subgroup.aN.scratch[scratch_idx C] = I;
-
-  SKC_RASTERIZE_VECTOR_EXPAND();
-
-  // propagate lanes to right using max scan
-  SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[0];
-  SKC_RASTERIZE_UINT const source  = skc_subgroup_scan_inclusive_max(scratch);
-
-#else
-  //
-  // SIMD APPROACH 2: SCALAR'ISH
-  //
-
-  SKC_RASTERIZE_UINT source = ( 0 );
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A)                 \
-  if (is_scratch_store C)                       \
-    ((uint *)&source)[scratch_idx C] = I;
-
-  SKC_RASTERIZE_VECTOR_EXPAND();
-
-  for (uint ii=1; ii<SKC_RASTERIZE_ELEMS_PER_SUBGROUP; ii++)
-    ((uint *)&source)[ii] = max(((uint *)&source)[ii-1],((uint *)&source)[ii]);
-#endif
-
-#else
-  //
-  // SIMT
-  //
-
-  //
-  // zero the volatile smem scratchpad using vector syntax
-  //
-  smem->subgroup.vN.scratch[skc_subgroup_lane()] = ( 0 );
-
-  //
-  // store source lane at starting lane
-  //
-  if (is_scratch_store)
-    smem->subgroup.aN.scratch[scratch_idx] = skc_subgroup_lane();
-
-  //
-  // propagate lanes to right using max scan
-  //
-  SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[skc_subgroup_lane()];
-  SKC_RASTERIZE_UINT const source  = skc_subgroup_scan_inclusive_max(scratch);
-#endif
-
-  return source;
-}
-
-//
-// sliver lines into subpixels
-//
-
-static
-void
-skc_sliver(__global SKC_ATOMIC_UINT          volatile * const bp_atomics,
-           __global union skc_bp_elem                 * const bp_elems,
-           __global uint                              * const bp_ids,
-           skc_uint                                     const bp_mask,
-           __global SKC_ATOMIC_UINT          volatile * const cohort_atomics,
-           skc_block_id_t                             * const subblocks,
-           skc_block_id_v_t                           * const blocks,
-           skc_uint                                   * const blocks_next,
-           skc_ttsk_v_t                               * const sk_v,
-           skc_uint                                   * const sk_v_next,
-           __global skc_ttsk_s_t                      * const sk_extent,
-           __local  struct skc_subgroup_smem volatile * const smem,
-           SKC_RASTERIZE_FLOAT                          const l0x,
-           SKC_RASTERIZE_FLOAT                          const l0y,
-           SKC_RASTERIZE_FLOAT                          const l1x,
-           SKC_RASTERIZE_FLOAT                          const l1y)
-{
-  //
-  // Y-SLIVERING
-  // -----------
-  //
-  // immediately sliver all multi-pixel lines in into 1-pixel high
-  // lines
-  //
-  // note this implicitly squelches horizontal lines
-  //
-  // there is another test for horizontal lines after x-slivering
-  // is complete
-  //
-
-  //
-  // will we need to flip the sign of y_delta ?
-  //
-  SKC_RASTERIZE_PREDICATE const y_lt   = (l0y <= l1y);
-  SKC_RASTERIZE_UINT      const dy_xor = y_lt ? 0 : 0x80000000;
-
-  //
-  // save 1/dy
-  //
-  SKC_RASTERIZE_FLOAT const y_denom = native_recip(l1y - l0y);
-
-  //
-  // how many non-horizontal subpixel y-axis slivers are there?
-  //
-  SKC_RASTERIZE_FLOAT const y_min   = floor(fmin(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
-  SKC_RASTERIZE_FLOAT const y_max   = ceil (fmax(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
-  SKC_RASTERIZE_FLOAT const y_base  = y_lt ? y_min : y_max;
-  SKC_RASTERIZE_FLOAT       y_segs  = y_max - y_min;
-
-  //
-  // inclusive subgroup scan of y_segs
-  //
-  SKC_RASTERIZE_FLOAT       y_iss   = skc_subgroup_scan_inclusive_add_float(y_segs);
-  SKC_RASTERIZE_FLOAT       y_ess   = y_iss - y_segs;
-  float                     y_rem   = skc_subgroup_last_float(y_iss);
-
-  //
-  // if this is a horizontal line then tweak y_iss so "is_scratch_store" always fails
-  //
-  if (y_segs == 0.0f)
-    y_iss = 0.0f;
-
-#if 0
-  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } (* %5.0f / %5.0f / %5.0f / %5.0f *) }, \n",a0x,a0y,a1x,a1y,y_segs,y_iss,y_ess,y_rem);
-#endif
-
-  //
-  // these values don't matter on first iteration
-  //
-  SKC_RASTERIZE_FLOAT n1x_prev = 0;
-  SKC_RASTERIZE_FLOAT n1y_prev = 0;
-
-  //
-  // loop until done
-  //
-  while (y_rem > 0.0f)
-    {
-      //
-      // distribute work across lanes
-      //
-      SKC_RASTERIZE_UINT const y_source = skc_scatter_scan_max(smem,y_iss,y_ess);
-
-      //
-      // get line at y_source line
-      //
-      SKC_RASTERIZE_FLOAT const m0x = skc_subgroup_shuffle(l0x,y_source);
-      SKC_RASTERIZE_FLOAT const m0y = skc_subgroup_shuffle(l0y,y_source);
-      SKC_RASTERIZE_FLOAT const m1x = skc_subgroup_shuffle(l1x,y_source);
-      SKC_RASTERIZE_FLOAT const m1y = skc_subgroup_shuffle(l1y,y_source);
-
-      //
-      // every lane will create a 1 pixel tall line "sliver"
-      //
-      // FIXME -- this gets expanded on SIMD
-      //
-      // if numerator == 1 then this is the first lane
-      // if numerator == s then this is the last  lane
-      //
-      SKC_RASTERIZE_FLOAT     const y_delta    = skc_delta_offset() - skc_subgroup_shuffle(y_ess,y_source);
-      SKC_RASTERIZE_FLOAT     const y_count    = skc_subgroup_shuffle(y_segs,y_source);
-
-      SKC_RASTERIZE_PREDICATE const is_y_first = (y_delta == 1.0f);
-      SKC_RASTERIZE_PREDICATE const is_y_last  = (y_delta >= y_count);
-
-      // toggle y_delta sign
-      SKC_RASTERIZE_FLOAT     const y_offset   = as_float((as_uint(y_delta) ^ intel_sub_group_shuffle(dy_xor,y_source)));
-
-      //
-      // calculate "right" line segment endpoint
-      //
-      SKC_RASTERIZE_FLOAT       n1y = (y_offset + skc_subgroup_shuffle(y_base,y_source)) * SKC_SUBPIXEL_Y_SCALE_UP;
-      SKC_RASTERIZE_FLOAT const n_t = (n1y - m0y) * skc_subgroup_shuffle(y_denom,y_source);
-      SKC_RASTERIZE_FLOAT       n1x = round(SKC_LERP(m0x,m1x,n_t));
-
-      //
-      // override c1 if this is last point
-      //
-      n1y = select(n1y,m1y,is_y_last);
-      n1x = select(n1x,m1x,is_y_last);
-
-      //
-      // shuffle up "left" line segment endpoint
-      //
-      // NOTE: Intel's shuffle_up is unique with its elegant
-      // "previous" argument so don't get used to it
-      //
-      SKC_RASTERIZE_FLOAT n0y = skc_subgroup_shuffle_up_1(n1y_prev,n1y);
-      SKC_RASTERIZE_FLOAT n0x = skc_subgroup_shuffle_up_1(n1x_prev,n1x);
-
-      //
-      // override shuffle up if this is the first line segment
-      //
-      n0y = select(n0y,m0y,is_y_first);
-      n0x = select(n0x,m0x,is_y_first);
-
-      //
-      // save previous right endpoint
-      //
-      n1x_prev = n1x;
-      n1y_prev = n1y;
-
-      //
-      // decrement by subgroup size
-      //
-      y_iss -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
-      y_ess -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
-      y_rem -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
-
-#if 0
-      //
-      // debug
-      //
-      if (n0y != n1y) {
-        printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",n0x,n0y,n1x,n1y);
-      }
-#endif
-
-      //
-      // X-SLIVERING
-      // -----------
-      //
-      // now sliver 1-pixel high lines into at either vertical or
-      // 1-pixel wide lines
-      //
-      // save original direction and work with increasing x
-      //
-      SKC_RASTERIZE_PREDICATE const x_lt   = (n0x <= n1x);
-      SKC_RASTERIZE_UINT      const dx_xor = x_lt ? 0 : 0x80000000;
-
-      //
-      // save 1/dy
-      //
-      SKC_RASTERIZE_FLOAT const x_denom  = native_recip(n1x - n0x);
-
-      //
-      // how many non-horizontal subpixel y-axis slivers are there?
-      //
-      SKC_RASTERIZE_FLOAT const x_min    = floor(fmin(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
-      SKC_RASTERIZE_FLOAT const x_max    = ceil (fmax(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
-      SKC_RASTERIZE_FLOAT const x_base   = x_lt ? x_min : x_max;
-      SKC_RASTERIZE_FLOAT const x_segs   = fmax(x_max - x_min,1.0f);
-
-      //
-      // inclusive subgroup scan of y_segs
-      //
-      SKC_RASTERIZE_FLOAT       x_iss    = skc_subgroup_scan_inclusive_add_float(x_segs);
-      SKC_RASTERIZE_FLOAT       x_ess    = x_iss - x_segs;
-      float                     x_rem    = skc_subgroup_last_float(x_iss);
-
-      //
-      // if this is a horizontal line then tweak x_iss so "is_scratch_store" always fails
-      //
-      //if (x_segs == 0.0f)
-      // x_iss = 0.0f;
-
-      //
-      // these values don't matter on first iteration
-      //
-      SKC_RASTERIZE_FLOAT       p1x_prev = 0;
-      SKC_RASTERIZE_FLOAT       p1y_prev = 0;
-
-      //
-      // loop until done
-      //
-      while (x_rem > 0)
-        {
-          //
-          // distribute work across lanes
-          //
-          SKC_RASTERIZE_UINT const x_source = skc_scatter_scan_max(smem,x_iss,x_ess);
-
-          //
-          // get line at y_source line
-          //
-          SKC_RASTERIZE_FLOAT const o0x = skc_subgroup_shuffle(n0x,x_source);
-          SKC_RASTERIZE_FLOAT const o0y = skc_subgroup_shuffle(n0y,x_source);
-          SKC_RASTERIZE_FLOAT const o1x = skc_subgroup_shuffle(n1x,x_source);
-          SKC_RASTERIZE_FLOAT const o1y = skc_subgroup_shuffle(n1y,x_source);
-
-          //
-          // every lane will create a 1 pixel tall line "sliver"
-          //
-          // FIXME -- this gets expanded on SIMD
-          //
-          // if numerator == 1 then this is the first lane
-          // if numerator == s then this is the last  lane
-          //
-          SKC_RASTERIZE_FLOAT     const x_delta    = skc_delta_offset() - skc_subgroup_shuffle(x_ess,x_source);
-          SKC_RASTERIZE_FLOAT     const x_count    = skc_subgroup_shuffle(x_segs,x_source);
-
-          SKC_RASTERIZE_PREDICATE const is_x_first = (x_delta == 1.0f);
-          SKC_RASTERIZE_PREDICATE const is_x_last  = (x_delta >= x_count);
-
-          // toggle x_delta sign
-          SKC_RASTERIZE_FLOAT     const x_offset   = as_float((as_uint(x_delta) ^ intel_sub_group_shuffle(dx_xor,x_source)));
-
-          //
-          // calculate "right" line segment endpoint
-          //
-          SKC_RASTERIZE_FLOAT       p1x = (x_offset + skc_subgroup_shuffle(x_base,x_source)) * SKC_SUBPIXEL_X_SCALE_UP;
-          SKC_RASTERIZE_FLOAT const p_t = (p1x - o0x) * skc_subgroup_shuffle(x_denom,x_source);
-          SKC_RASTERIZE_FLOAT       p1y = round(SKC_LERP(o0y,o1y,p_t));
-
-          //
-          // override c1 if this is last point
-          //
-          p1x = select(p1x,o1x,is_x_last);
-          p1y = select(p1y,o1y,is_x_last);
-
-          //
-          // shuffle up "left" line segment endpoint
-          //
-          // NOTE: Intel's shuffle_up is unique with its elegant
-          // "previous" argument so don't get used to it
-          //
-          SKC_RASTERIZE_FLOAT p0x = skc_subgroup_shuffle_up_1(p1x_prev,p1x);
-          SKC_RASTERIZE_FLOAT p0y = skc_subgroup_shuffle_up_1(p1y_prev,p1y);
-
-          //
-          // override shuffle up if this is the first line segment
-          //
-          p0x = select(p0x,o0x,is_x_first);
-          p0y = select(p0y,o0y,is_x_first);
-
-          //
-          // save previous right endpoint
-          //
-          p1x_prev = p1x;
-          p1y_prev = p1y;
-
-          //
-          // decrement by subgroup size
-          //
-          x_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
-          x_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
-          x_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
-
-          //
-          // only non-horizontal subpixel lines are valid
-          //
-          SKC_RASTERIZE_PREDICATE is_active = (p0y != p1y);
-
-          //
-          // if no lanes are active then continue
-          //
-          // FIXME -- THIS SIMPLE SUB_GROUP_ANY TEST SIGNIFICANTLY
-          // IMPACTS PERFORMANCE (+12% ?)
-          //
-          // IT SHOULDN'T !!!
-          //
-#if 0
-          if (!skc_subgroup_any(is_active))
-            continue;
-#endif
-
-          //
-          // Option 1: use SLM for explicitly managed coalesced stores
-          //
-          // 1. which tile does this line belong?
-          // 2. hash tile coordinates
-          // 3. lookup hash
-          // 4. if tile matches then SLM append keys
-          // 5. if tile doesn't match
-          //   a. flush
-          //   b. create new TTSK_RYX
-          //   c. obtain TTSB block from pool
-          //   d. goto 3.
-          //
-
-          //
-          // Option 2: rely on L1/L2/L3 to mitigate non-coalesced stores
-          //
-          // 1. which tile does this line belong?
-          // 2. hash tile coordinates
-          // 3. lookup hash
-          // 4. if tile matches then GMEM append keys
-          // 5. if tile doesn't match
-          //   a. flush (and invalidate empty elems)
-          //   b. create new TTSK_RYX
-          //   c. obtain TTSB block from pool
-          //   d. goto 3.
-          //
-
-          //
-          // The virtual rasterization surface is very large and
-          // signed: +/- ~64K-256K, depending on the architecture.
-          //
-          // Rasters must be clipped to the virtual surface and,
-          // optionally, clipped even further on a per raster
-          // basis.
-          //
-
-          //
-          // Clip to the per-raster clip
-          //
-
-          /*
-
-            CLIP HERE
-
-          */
-
-          //
-          // Hash the tile coordinates
-          //
-          // This table lists nominal values for each architecture.
-          // We want to choose values that are naturally fit the
-          // "width" of the architecture.
-          //
-          //   SIMD   RANGE   BITS  MAX RANGE  MAX BINS  HASH BITS
-          //   ----  -------  ----  ---------  --------  ---------
-          //     4   [0,  4]    3    [0,  7]      10      mod(10)  <-- SSE42, ?
-          //     8   [0,  8]    4    [0, 15]       8         3     <-- GEN*,AVX*
-          //    16   [0, 16]    5    [0, 31]       6      mod(6)   <-- GEN*,?
-          //    32   [0, 32]    6    [0, 63]       5      mod(5)   <-- CUDA,PowerVR,Adreno,GEN*
-          //    64   [0, 64]    7    [0,127]       4         2     <-- AMD Radeon
-          //
-          // NOTE: When possible, bias the hash toward using more y
-          // bits because of:
-          //
-          //   1. the 90 degree counter-clockwise rotation that we put
-          //      in place to offset the render-time clockwise
-          //      rotation
-          //
-          //   2. the likely presence of left-to-right or
-          //      right-to-left glyphs.
-          //
-          // For power-of-two bins, the hash is easy.
-          //
-          // For non-power-of-two, we may want to either implement a
-          // fast mod (compiler should do this for us... hahahaha) or
-          // drop down to the next power-of-two.
-          //
-
-          //
-          // FIXME -- this snarl is not good -- can probably reduce
-          // some of the sign casting but some is there to vectorize a
-          // scalar
-          //
-          SKC_RASTERIZE_INT       const z0y    = SKC_CONVERT(SKC_RASTERIZE_INT)(p0y);
-          SKC_RASTERIZE_INT       const z1y    = SKC_CONVERT(SKC_RASTERIZE_INT)(p1y);
-
-          SKC_RASTERIZE_INT       const z0x    = SKC_CONVERT(SKC_RASTERIZE_INT)(p0x);
-          SKC_RASTERIZE_INT       const z1x    = SKC_CONVERT(SKC_RASTERIZE_INT)(p1x);
-
-          SKC_RASTERIZE_INT       const min_y  = min(z0y,z1y);
-          SKC_RASTERIZE_INT       const max_y  = max(z0y,z1y);
-
-          SKC_RASTERIZE_INT       const tile_y = min_y >> SKC_SUBTILE_RESL_Y_LOG2;
-
-          SKC_RASTERIZE_UINT      const ty     = SKC_AS(SKC_RASTERIZE_UINT)(min_y) & SKC_SUBTILE_MASK_Y;
-          SKC_RASTERIZE_INT             dy     = SKC_AS(SKC_RASTERIZE_INT)(z1y - z0y);
-
-          //
-          // map [+1,+32] to [ 0,+31]
-          // map [-1,-32] to [-1,-32]
-          //
-          SKC_RASTERIZE_INT             dys    = (dy + (~dy >> 31)) << 26;
-
-          SKC_RASTERIZE_INT       const min_x  = min(z0x,z1x);
-          SKC_RASTERIZE_INT       const max_x  = max(z0x,z1x);
-          SKC_RASTERIZE_INT       const tile_x = min_x >> SKC_SUBTILE_RESL_X_LOG2;
-
-          SKC_RASTERIZE_UINT      const tx     = SKC_AS(SKC_RASTERIZE_UINT)(min_x) & SKC_SUBTILE_MASK_X;
-          SKC_RASTERIZE_UINT      const sx     = SKC_AS(SKC_RASTERIZE_UINT)(max_x - min_x);
-
-          SKC_RASTERIZE_UINT      const tts    = dys | (ty << 16) | (sx << 10) | tx;
-
-          SKC_RASTERIZE_UINT      const hash   = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & SKC_RASTERIZE_TILE_HASH_Y_MASK) << SKC_RASTERIZE_TILE_HASH_X_BITS) |
-                                                   (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & SKC_RASTERIZE_TILE_HASH_X_MASK));
-
-          SKC_RASTERIZE_UINT      const yx     = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & 0xFFF) << 12) | (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & 0xFFF));
-
-#if 0
-          printf("(%3u, %3u)\n",tile_y,tile_x);
-#endif
-
-#if 0
-          if (is_active)
-            printf("( %3u, %3u ) : [ %3u, %3u, %3d, %3d, %3u ]\n",tile_y,tile_x,ty,tx,dy,((int)dys)>>26,sx);
-#endif
-
-          //
-          // debug
-          //
-#if 0 // PRINTF_ENABLE
-
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A)                                         \
-          if (is_active C)                                              \
-            printf("{ { %5d, %5d }, { %5d, %5d } (* %2u *) },\n",z0x C,z0y C,z1x C,z1y C,hash C);
-
-          SKC_RASTERIZE_VECTOR_EXPAND();
-#else
-          if (is_active)
-            printf("{ { %5d, %5d }, { %5d, %5d } } (* %2u *),\n",z0x,z0y,z1x,z1y,hash);
-#endif
-
-#endif
-          //
-          // flush all active lanes
-          //
-          while (true)
-            {
-              //
-              // either gather load or vector load+shuffle the yx keys
-              //
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-              SKC_RASTERIZE_BIN       const yx_bin     = smem->bin.vN.yx;
-              SKC_RASTERIZE_UINT      const yx_cur     = shuffle(yx_bin,hash);
-#else
-              SKC_RASTERIZE_UINT      const yx_cur     = smem->bin.aN.yx[hash];
-#endif
-
-              //
-              // does yx for lane match yx for hash?
-              //
-              SKC_RASTERIZE_UINT      const active_yx  = is_active ? yx : SKC_RASTERIZE_YX_INVALID;
-              SKC_RASTERIZE_PREDICATE const is_match   = (yx_cur == active_yx);
-
-              //
-              // OpenCL spec: "When casting a bool to a vector integer
-              // data type, the vector components will be set to -1
-              // (i.e. all bits set) if the vector bool value is true
-              // and 0 otherwise.
-              //
-#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
-              SKC_RASTERIZE_UINT      const h_match    = (SKC_RASTERIZE_UINT)is_match;
-#else
-              SKC_RASTERIZE_UINT      const h_match    = abs(is_match); // {-1,0} -> {+1,0}
-#endif
-              //
-              // how many new elements for each matching hash bin?
-              //
-              SKC_RASTERIZE_UINT      const h_shl      = hash * SKC_RASTERIZE_TILE_HASH_BIN_BITS;
-              SKC_RASTERIZE_UINT      const h          = h_match << h_shl;
-
-              //
-              // prefix sum all of the bins in parallel
-              //
-              SKC_RASTERIZE_UINT      const h_iss      = skc_subgroup_scan_inclusive_add_uint(h);
-              SKC_RASTERIZE_UINT      const h_total    = skc_subgroup_last_uint(h_iss);
-
-              //
-              // current bin counts
-              //
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-              SKC_RASTERIZE_BIN       const count_bin  = smem->bin.vN.count;
-              SKC_RASTERIZE_UINT      const count_cur  = shuffle(count_bin,hash);
-#else
-              SKC_RASTERIZE_UINT      const count_cur  = smem->bin.aN.count[hash];
-#endif
-
-              //
-              // calculate where each cache-hit and in-bounds tts should be stored
-              //
-              SKC_RASTERIZE_UINT      const ttsb_index = (h_iss   >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur - 1;
-              SKC_RASTERIZE_UINT      const count_new  = (h_total >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur;
-
-              //
-              // which lanes can append to a matching bin?
-              //
-              SKC_RASTERIZE_PREDICATE const is_append  = is_match && (ttsb_index < SKC_DEVICE_SUBBLOCK_WORDS);
-
-              //
-              // scatter append tts elements to bin blocks
-              //
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1)
-              //
-              // SIMD
-              //
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A)                                         \
-              if (is_append C)                                          \
-                {                                                       \
-                  smem->bin.aN.ttsb [hash C][ttsb_index C] = tts       C; \
-                  smem->bin.aN.count[hash C]               = count_new C; \
-                }
-
-              SKC_RASTERIZE_VECTOR_EXPAND();
-#else
-              //
-              // SIMT
-              //
-              if (is_append)
-                {
-                  smem->bin.aN.ttsb [hash][ttsb_index] = tts;
-                  smem->bin.aN.count[hash]             = count_new; // it's ok if this is > SKC_DEVICE_SUBBLOCK_WORDS
-                }
-#endif
-              //
-              // try to keep predicate updates SIMD-friendly and
-              // outside of predicated code paths -- this is not
-              // always how we would normally do things on SIMT but
-              // either approach is acceptable
-              //
-
-              //
-              // mask off lanes/components that successfully appended
-              //
-              is_active = is_active && !is_append;
-
-              //
-              // are there any active lanes left?
-              //
-              if (!skc_subgroup_any(is_active))
-                break;
-
-              //
-              // There are active lanes that couldn't be appended to a
-              // bin because their hashes collided with the bin's
-              // current ryx key then those bins must be ejected.
-              //
-              // Note that we do not eject "full" bins because lazily
-              // waiting for a collision results in simpler code.
-              //
-              skc_flush(bp_atomics,
-                        bp_elems,
-                        bp_ids,
-                        bp_mask,
-                        cohort_atomics,
-                        subblocks,
-                        blocks,
-                        blocks_next,
-                        sk_v,
-                        sk_v_next,
-                        sk_extent,
-                        smem,
-                        hash,
-                        yx,
-                        is_active);
-            }
-        }
-    }
-}
-
-//
-// INITIALIZE SMEM
-//
-// Note that SIMD/SIMT have nearly the same syntax.
-//
-static
-void
-skc_smem_init(__local struct skc_subgroup_smem volatile * const smem)
-{
-  //
-  // initialize smem bins
-  //
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-  //
-  // SIMD
-  //
-  smem->bin.vN.yx    = ( SKC_RASTERIZE_YX_INIT );
-  smem->bin.vN.count = ( 0 );
-#else
-  //
-  // SIMT
-  //
-  int idx = skc_subgroup_lane();
-
-#if   ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT < SKC_RASTERIZE_ELEMS_PER_SUBGROUP )
-  if (idx < SKC_RASTERIZE_TILE_HASH_BIN_COUNT)
-#elif ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT > SKC_RASTERIZE_ELEMS_PER_SUBGROUP )
-  for (; idx<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; idx+=SKC_RASTERIZE_SUBGROUP_SIZE)
-#endif
-    {
-      smem->bin.aN.yx   [idx] = ( SKC_RASTERIZE_YX_INIT );
-      smem->bin.aN.count[idx] = ( 0 );
-    }
-#endif
-}
-
-//
-// RASTERIZE CUBIC KERNEL
-//
-
-static
-void
-skc_rasterize_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
-                     __global union skc_bp_elem                * const bp_elems,
-                     __global uint                             * const bp_ids,
-                     skc_uint                                    const bp_mask,
-
-                     __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
-                     __global skc_ttsk_s_t                     * const sk_extent,
-
-                     __local struct skc_subgroup_smem volatile * const smem,
-
-                     skc_uint                                  * const nodeword,
-                     skc_block_id_t                            * const id,
-
-                     union skc_transform              const    * const tv,
-                     union skc_path_clip              const    * const cv,
-                     skc_uint                                    const cohort)
-{
-  //
-  // the initial segment idx and segments-per-block constant determine
-  // how many block ids will need to be loaded
-  //
-  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
-  skc_segment_next(bp_elems,nodeword,id);
-
-  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
-  skc_segment_next(bp_elems,nodeword,id);
-
-  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
-  skc_segment_next(bp_elems,nodeword,id);
-
-  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
-  skc_segment_next(bp_elems,nodeword,id);
-
-  SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
-  skc_segment_next(bp_elems,nodeword,id);
-
-  SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
-  skc_segment_next(bp_elems,nodeword,id);
-
-  SKC_RASTERIZE_FLOAT const c3x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
-  skc_segment_next(bp_elems,nodeword,id);
-
-  SKC_RASTERIZE_FLOAT const c3y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
-  //
-  // apply transform
-  //
-  // note that we only care if the end points are rounded to subpixel precision
-  //
-  // FIXME -- transformation is currently affine-only support perspective later
-  //
-  // the affine transformation requires 8 FMA + 2 ROUND operations
-  //
-  SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx  + c0y * tv->shx + tv->tx);
-  SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy  + tv->ty);
-
-  SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;
-  SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;
-
-  SKC_RASTERIZE_FLOAT const t2x = c2x * tv->sx  + c2y * tv->shx + tv->tx;
-  SKC_RASTERIZE_FLOAT const t2y = c2x * tv->shy + c2y * tv->sy  + tv->ty;
-
-  SKC_RASTERIZE_FLOAT const t3x = round(c3x * tv->sx  + c3y * tv->shx + tv->tx);
-  SKC_RASTERIZE_FLOAT const t3y = round(c3x * tv->shy + c3y * tv->sy  + tv->ty);
-
-  //
-  //
-  //
-#if PRINTF_ENABLE
-
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A)                                         \
-  printf("{ { %.02f, %.02f }, { %.02f, %.02f },"                        \
-         "  { %.02f, %.02f }, { %.02f, %.02f } },\n",                   \
-         b0x C,b0y C,t1x C,t1y C,                                       \
-         t2x C,t2y C,t3x C,t3y C);
-
-  SKC_RASTERIZE_VECTOR_EXPAND();
-
-#else
-
-  printf("{ { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f } },\n",
-         b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);
-
-#endif
-
-#endif
-
-  //
-  // OLD APPROACH
-  // ------------
-  //
-  // The Spinel CUDA rasterizer was significantly more complex and
-  // performed a few different tasks that are probably best kept
-  // separate.
-  //
-  // The Spinel rasterizer Bezier held 4-element x and y coordinates
-  // in adjacent lanes. This simplified intermingling of single lane
-  // 4-coordinate line segments with two-lane cubic Beziers.
-  //
-  // After transformation of the input segments, the Spinel rasterizer
-  // would test cubics for flatness and, if flat, collapse the
-  // adjacent lanes into a single line lane and an empty lane.
-  //
-  // Any lines would then be appended to a line queue.
-  //
-  // Any cubics would then be subdivided.
-  //
-  // The reclassification process would be repeated.
-  //
-  // NEW APPROACH
-  // ------------
-  //
-  // Assume we're only working with cubics in this kernel.
-  //
-  // Optimization: if the line segment is a special case -- a cusp,
-  // has 1+ inflections, or a loop -- it might be beneficial to
-  // subdivide the control cage 1+ times in order to separate the
-  // flatter segments the high-velocity region(s).
-  //
-  // This means we want to split using [a,b] formulation to _directly_
-  // subdivide producing a new control cage.
-  //
-  // Wang's Formula is still useful even if we subdivide once or twice
-  // as it's so cheap that it might give some useful hints about where
-  // the high-velocity sections of curve reside.
-  //
-  // But it seems like using Wang's and directly flattening to line
-  // segments without any subdivision is good enough for the limited
-  // set of test cases that I've tried.
-  //
-  // So... use Wang's Formula to estimate how many line segment are
-  // required to properly flatten the cubics.
-  //
-  // Then use inclusive/exclusive scans to put all the lanes to work:
-  //
-  //   1. segmenting cubics to line segments
-  //
-  //   2. slivering line segments into 1-pixel high line segments
-  //
-  //   3. slivering 1-pixel high line segments into 1-pixel wide line
-  //      segments
-  //
-  // MORE BACKGROUND ON NEW APPROACH
-  // -------------------------------
-  //
-  // Two options for handling line segments:
-  //
-  // 1. append the line segments onto an SLM array until enough
-  //    work has been accrued (Spinel does this)
-  //
-  // 2. immediately sliver the potentially multi-pixel line
-  //    segments into subpixel lines
-  //
-  // The advantage of (1) is that it guarantees the slivering
-  // process will, on average, always be emitting a full subgroup
-  // of subpixel lines.
-  //
-  // The advantage of (2) is that it reduces code complexity and
-  // leaves more room for SLM tile bins. The difference between Spinel
-  // and Skia Compute is that Wang's Formula guarantees there will be
-  // a full subgroup of multi-pixel lines unless this is the final
-  // iteration of the warp of multi-pixel lines.
-  //
-  // Note that wider GPU architectures might benefit from (1) and
-  // other work accumulation strategies because it will minimize
-  // partial warp workloads in the final iteration of each stage.  It
-  // also minimizes the sunk cost of the uniform control logic steps.
-  //
-  // So let's implement (2) for now...
-  //
-
-  //
-  // And... begin!
-  //
-  // Estimate how many line segments are in quad/cubic curve.
-  //
-  // Wang's Formula will return zero if the control points are
-  // collinear but we bump it up to 1.0f.
-  //
-  SKC_RASTERIZE_FLOAT const s_segs  = skc_wangs_formula_cubic(b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);
-
-  //
-  // if there are free registers then precalculate the reciprocal for
-  // each estimated segments since it will never change
-  //
-  SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);
-
-
-  //
-  // inclusive add scan of estimated line segments
-  // exclusive add scan of estimated line segments
-  // total number       of estimated line segments
-  //
-  SKC_RASTERIZE_FLOAT       s_iss   = skc_subgroup_scan_inclusive_add_float(s_segs);
-  SKC_RASTERIZE_FLOAT       s_ess   = s_iss - s_segs;
-  float                     s_rem   = skc_subgroup_last_float(s_iss); // scalar
-
-  //
-  // Precompute cubic polynomial coefficients from transformed control
-  // cage so we can shuffle them in on each iteration of the outer
-  // loop and then evaluate the polynomial in Horner form.
-  //
-  //                            |  1  0  0  0 | | c0 |
-  //                            |             | |    |
-  //                            | -3  3  0  0 | | c1 |
-  //   B(t) = [ 1 t^1 t^2 t^3 ] |             | |    |
-  //                            |  3 -6  3  0 | | c2 |
-  //                            |             | |    |
-  //                            | -1  3 -3  1 | | c3 |
-  //
-  //
-  SKC_RASTERIZE_FLOAT const b1x = mad(-3.0f,b0x,3.0f*t1x);                // 2 - 1 MAD + MUL
-  SKC_RASTERIZE_FLOAT const b1y = mad(-3.0f,b0y,3.0f*t1y);                // 2 - 1 MAD + MUL
-
-  SKC_RASTERIZE_FLOAT const b2x = mad(3.0f,b0x,mad(-6.0f,t1x,3.0f*t2x));  // 3 - 2 MAD + MUL
-  SKC_RASTERIZE_FLOAT const b2y = mad(3.0f,b0y,mad(-6.0f,t1y,3.0f*t2y));  // 3 - 2 MAD + MUL
-
-  SKC_RASTERIZE_FLOAT const b3x = mad(3.0f,t1x,mad(-3.0f,t2x,t3x)) - b0x; // 3 - 2 MAD + SUB
-  SKC_RASTERIZE_FLOAT const b3y = mad(3.0f,t1y,mad(-3.0f,t2y,t3y)) - b0y; // 3 - 2 MAD + SUB
-
-  //
-  // these values don't matter on the first iteration
-  //
-  SKC_RASTERIZE_FLOAT l1x_prev  = 0;
-  SKC_RASTERIZE_FLOAT l1y_prev  = 0;
-
-  //
-  // allocate and init in-register TTSK keys
-  //
-  skc_uint     sk_v_next = 0;
-  skc_ttsk_v_t sk_v; 
-
-  sk_v.hi = cohort;
-
-  //
-  // initialize smem
-  //
-  skc_smem_init(smem);
-
-  //
-  // initialize blocks / subblocks
-  //
-  skc_block_id_v_t blocks;
-  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
-
-#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
-  skc_block_id_t   subblocks   = 0;
-#endif
-
-  //
-  // loop until done
-  //
-  while (s_rem > 0)
-    {
-      //
-      // distribute work across lanes
-      //
-      SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);
-
-      //
-      // every lane has a fraction to work off of
-      //
-      // FIXME -- this gets expanded on SIMD
-      //
-      // if delta == 1      then this is the first lane
-      // if count == s_segs then this is the last  lane
-      //
-      SKC_RASTERIZE_FLOAT     const s_delta    = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
-      SKC_RASTERIZE_FLOAT     const s_count    = skc_subgroup_shuffle(s_segs,s_source);
-
-      SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
-      SKC_RASTERIZE_PREDICATE const is_s_last  = (s_delta >= s_count);
-
-      //
-      // init parametric t
-      //
-      SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?
-
-      //
-      // if last then override to a hard 1.0f
-      //
-      s_t    = is_s_last ? 1.0f : s_t;
-
-      //
-      // decrement by subgroup size
-      //
-      s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
-      s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
-      s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
-
-      //
-      // now every lane knows what to do and the following lines will
-      // pump out up to SUBGROUP_SIZE line segments
-      //
-      // obtain the src vertices through shared or via a shuffle
-      //
-
-      //
-      // shuffle in the polynomial coefficients their source lane
-      //
-      SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
-      SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);
-
-      SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
-      SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);
-
-      SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
-      SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);
-
-      SKC_RASTERIZE_FLOAT const s3x = skc_subgroup_shuffle(b3x,s_source);
-      SKC_RASTERIZE_FLOAT const s3y = skc_subgroup_shuffle(b3y,s_source);
-
-      //
-      // calculate "right" line segment endpoint using Horner form
-      //
-      SKC_RASTERIZE_FLOAT       l1x = round(mad(mad(mad(s3x,s_t,s2x),s_t,s1x),s_t,s0x)); // 3 MAD + ROUND
-      SKC_RASTERIZE_FLOAT       l1y = round(mad(mad(mad(s3y,s_t,s2y),s_t,s1y),s_t,s0y)); // 3 MAD + ROUND
-
-      //
-      // shuffle up "left" line segment endpoint
-      //
-      // NOTE: Intel's shuffle_up is unique with its elegant
-      // "previous" argument so don't get used to it
-      //
-      SKC_RASTERIZE_FLOAT       l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
-      SKC_RASTERIZE_FLOAT       l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);
-
-      //
-      // save previous right endpoint
-      //
-      l1x_prev = l1x;
-      l1y_prev = l1y;
-
-      //
-      // override shuffle up if this is the first line segment
-      //
-      l0x = select(l0x,s0x,is_s_first);
-      l0y = select(l0y,s0y,is_s_first);
-
-      //
-      // sliver lines
-      //
-      skc_sliver(bp_atomics,
-                 bp_elems,
-                 bp_ids,
-                 bp_mask,
-                 cohort_atomics,
-                 &subblocks,
-                 &blocks,
-                 &blocks_next,
-                 &sk_v,
-                 &sk_v_next,
-                 sk_extent,
-                 smem,
-                 l0x,l0y,l1x,l1y);
-    }
-
-  //
-  // - flush work-in-progress blocks
-  // - return unused block ids
-  //
-  skc_finalize(bp_atomics,
-               bp_elems,
-               bp_ids,
-               bp_mask,
-               cohort_atomics,
-               &blocks,
-               blocks_next,
-               &sk_v,
-               sk_v_next,
-               sk_extent,
-               smem);
-}
-
-//
-// RASTERIZE QUAD KERNEL
-//
-
-static
-void
-skc_rasterize_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
-                    __global union skc_bp_elem                * const bp_elems,
-                    __global uint                             * const bp_ids,
-                    skc_uint                                    const bp_mask,
-
-                    __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
-                    __global skc_ttsk_s_t                     * const sk_extent,
-
-                    __local struct skc_subgroup_smem volatile * const smem,
-                    
-                    skc_uint                                  * const nodeword,
-                    skc_block_id_t                            * const id,
-
-                    union skc_transform              const    * const tv,
-                    union skc_path_clip              const    * const cv,
-                    skc_uint                                    const cohort)
-{
-  //
-  // the initial segment idx and segments-per-block constant determine
-  // how many block ids will need to be loaded
-  //
-  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
-  skc_segment_next(bp_elems,nodeword,id);
-
-  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
-  skc_segment_next(bp_elems,nodeword,id);
-
-  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
-  skc_segment_next(bp_elems,nodeword,id);
-
-  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
-  skc_segment_next(bp_elems,nodeword,id);
-
-  SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
-  skc_segment_next(bp_elems,nodeword,id);
-
-  SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
-  //
-  // apply transform
-  //
-  // note that we only care if the end points are rounded to subpixel precision
-  //
-  // FIXME -- transformation is currently affine-only support perspective later
-  //
-  // the affine transformation requires 8 FMA + 2 ROUND operations
-  //
-  SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx  + c0y * tv->shx + tv->tx);
-  SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy  + tv->ty);
-
-  SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;
-  SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;
-
-  SKC_RASTERIZE_FLOAT const t2x = round(c2x * tv->sx  + c2y * tv->shx + tv->tx);
-  SKC_RASTERIZE_FLOAT const t2y = round(c2x * tv->shy + c2y * tv->sy  + tv->ty);
-
-  //
-  // Estimate how many line segments are in quad/cubic curve.
-  //
-  // Wang's Formula will return zero if the control points are
-  // collinear but we bump it up to 1.0f.
-  //
-  SKC_RASTERIZE_FLOAT const s_segs  = skc_wangs_formula_quadratic(b0x,b0y,t1x,t1y,t2x,t2y);
-
-  //
-  // if there are free registers then precalculate the reciprocal for
-  // each estimated segments since it will never change
-  //
-  SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);
-
-
-  //
-  // inclusive add scan of estimated line segments
-  // exclusive add scan of estimated line segments
-  // total number       of estimated line segments
-  //
-  SKC_RASTERIZE_FLOAT       s_iss   = skc_subgroup_scan_inclusive_add_float(s_segs);
-  SKC_RASTERIZE_FLOAT       s_ess   = s_iss - s_segs;
-  float                     s_rem   = skc_subgroup_last_float(s_iss); // scalar
-
-  //
-  // Precompute quadratic polynomial coefficients from control cage so
-  // we can shuffle them in on each iteration of the outer loop and
-  // then evaluate the polynomial in Horner form.
-  //
-
-  //                        |  1  0  0  | | c0 |
-  //                        |           | |    |
-  //   B(t) = [ 1 t^1 t^2 ] | -2  2  0  | | c1 |
-  //                        |           | |    |
-  //                        |  1 -2  1  | | c2 |
-  //
-  //
-  SKC_RASTERIZE_FLOAT const b1x = mad(-2.0f,b0x,2.0f*t1x); // 2 - 1 MAD + MUL
-  SKC_RASTERIZE_FLOAT const b1y = mad(-2.0f,b0y,2.0f*t1y); // 2 - 1 MAD + MUL
-
-  SKC_RASTERIZE_FLOAT const b2x = mad(-2.0f,t1x,b0x+t2x);  // 2 - 1 MAD + ADD
-  SKC_RASTERIZE_FLOAT const b2y = mad(-2.0f,t1y,b0y+t2y);  // 2 - 1 MAD + ADD
-
-  //
-  // these values don't matter on the first iteration
-  //
-  SKC_RASTERIZE_FLOAT l1x_prev  = 0;
-  SKC_RASTERIZE_FLOAT l1y_prev  = 0;
-
-  //
-  // allocate and init in-register TTSK keys
-  //
-  skc_uint     sk_v_next = 0;
-  skc_ttsk_v_t sk_v; 
-
-  sk_v.hi = cohort;
-
-  //
-  // initialize smem
-  //
-  skc_smem_init(smem);
-
-  //
-  // initialize blocks / subblocks
-  //
-  skc_block_id_v_t blocks;
-  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
-
-#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
-  skc_block_id_t   subblocks   = 0;
-#endif
-
-  //
-  // loop until done
-  //
-  while (s_rem > 0)
-    {
-      //
-      // distribute work across lanes
-      //
-      SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);
-
-      //
-      // every lane has a fraction to work off of
-      //
-      // FIXME -- this gets expanded on SIMD
-      //
-      // if delta == 1      then this is the first lane
-      // if count == s_segs then this is the last  lane
-      //
-      SKC_RASTERIZE_FLOAT     const s_delta    = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
-      SKC_RASTERIZE_FLOAT     const s_count    = skc_subgroup_shuffle(s_segs,s_source);
-
-      SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
-      SKC_RASTERIZE_PREDICATE const is_s_last  = (s_delta >= s_count);
-
-      //
-      // init parametric t
-      //
-      SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?
-
-      //
-      // if last then override to a hard 1.0f
-      //
-      s_t    = is_s_last ? 1.0f : s_t;
-
-      //
-      // decrement by subgroup size
-      //
-      s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
-      s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
-      s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
-
-      //
-      // now every lane knows what to do and the following lines will
-      // pump out up to SUBGROUP_SIZE line segments
-      //
-      // obtain the src vertices through shared or via a shuffle
-      //
-
-      //
-      // shuffle in the polynomial coefficients their source lane
-      //
-      SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
-      SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);
-
-      SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
-      SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);
-
-      SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
-      SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);
-
-      //
-      // calculate "right" line segment endpoint using Horner form
-      //
-      SKC_RASTERIZE_FLOAT       l1x = round(mad(mad(s2x,s_t,s1x),s_t,s0x)); // 2 MAD + ROUND
-      SKC_RASTERIZE_FLOAT       l1y = round(mad(mad(s2y,s_t,s1y),s_t,s0y)); // 2 MAD + ROUND
-
-      //
-      // shuffle up "left" line segment endpoint
-      //
-      // NOTE: Intel's shuffle_up is unique with its elegant
-      // "previous" argument so don't get used to it
-      //
-      SKC_RASTERIZE_FLOAT       l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
-      SKC_RASTERIZE_FLOAT       l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);
-
-      //
-      // save previous right endpoint
-      //
-      l1x_prev = l1x;
-      l1y_prev = l1y;
-
-      //
-      // override shuffle up if this is the first line segment
-      //
-      l0x = select(l0x,s0x,is_s_first);
-      l0y = select(l0y,s0y,is_s_first);
-
-      //
-      // sliver lines
-      //
-      skc_sliver(bp_atomics,
-                 bp_elems,
-                 bp_ids,
-                 bp_mask,
-                 cohort_atomics,
-                 &subblocks,
-                 &blocks,
-                 &blocks_next,
-                 &sk_v,
-                 &sk_v_next,
-                 sk_extent,
-                 smem,
-                 l0x,l0y,l1x,l1y);
-    }
-
-  //
-  // - flush work-in-progress blocks
-  // - return unused block ids
-  //
-  skc_finalize(bp_atomics,
-               bp_elems,
-               bp_ids,
-               bp_mask,
-               cohort_atomics,
-               &blocks,
-               blocks_next,
-               &sk_v,
-               sk_v_next,
-               sk_extent,
-               smem);
-}
-
-//
-// RASTERIZE LINE KERNEL
-//
-
-static
-void
-skc_rasterize_lines(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
-                    __global union skc_bp_elem                * const bp_elems,
-                    __global uint                             * const bp_ids,
-                    skc_uint                                    const bp_mask,
-
-                    __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
-                    __global skc_ttsk_s_t                     * const sk_extent,
-
-                    __local struct skc_subgroup_smem volatile * const smem,
-                    
-                    skc_uint                                  * const nodeword,
-                    skc_block_id_t                            * const id,
-
-                    union skc_transform              const    * const tv,
-                    union skc_path_clip              const    * const cv,
-                    skc_uint                                    const cohort)
-{
-  //
-  // the initial segment idx and segments-per-block constant determine
-  // how many block ids will need to be loaded
-  //
-  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
-  skc_segment_next(bp_elems,nodeword,id);
-
-  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
-  skc_segment_next(bp_elems,nodeword,id);
-
-  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
-  skc_segment_next(bp_elems,nodeword,id);
-
-  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
-#if 0
-  // printf("%5u : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",(skc_uint)get_global_id(0),c0x,c0y,c1x,c1y);
-  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",c0x,c0y,c1x,c1y);
-#endif
-
-  //
-  // apply transform
-  //
-  // note that we only care if the end points are rounded to subpixel precision
-  //
-  // FIXME -- transformation is currently affine-only
-  // FIXME -- support perspective later
-  //
-  // the affine transformation requires 8 FMA + 4 ROUND operations
-  //
-  SKC_RASTERIZE_FLOAT const l0x = round(c0x * tv->sx  + c0y * tv->shx + tv->tx);
-  SKC_RASTERIZE_FLOAT const l0y = round(c0x * tv->shy + c0y * tv->sy  + tv->ty);
-
-  SKC_RASTERIZE_FLOAT const l1x = round(c1x * tv->sx  + c1y * tv->shx + tv->tx);
-  SKC_RASTERIZE_FLOAT const l1y = round(c1x * tv->shy + c1y * tv->sy  + tv->ty);
-
-#if 0
-  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",l0x,l0y,l1x,l1y);
-#endif
-
-  //
-  // allocate and init in-register TTSK keys
-  //
-  skc_uint     sk_v_next = 0;
-  skc_ttsk_v_t sk_v; 
-
-  sk_v.hi = cohort;
-
-  //
-  // initialize smem
-  //
-  skc_smem_init(smem);
-
-  //
-  // initialize blocks / subblocks
-  //
-  skc_block_id_v_t blocks;
-  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
-
-#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
-  skc_block_id_t   subblocks   = 0;
-#endif
-
-  //
-  // sliver lines
-  //
-  skc_sliver(bp_atomics,
-             bp_elems,
-             bp_ids,
-             bp_mask,
-             cohort_atomics,
-             &subblocks,
-             &blocks,
-             &blocks_next,
-             &sk_v,
-             &sk_v_next,
-             sk_extent,
-             smem,
-             l0x,l0y,l1x,l1y);
-
-  //
-  // - flush work-in-progress blocks
-  // - return unused block ids
-  //
-  skc_finalize(bp_atomics,
-               bp_elems,
-               bp_ids,
-               bp_mask,
-               cohort_atomics,
-               &blocks,
-               blocks_next,
-               &sk_v,
-               sk_v_next,
-               sk_extent,
-               smem);
-}
-
-//
-//
-//
-
-__kernel
-SKC_RASTERIZE_KERNEL_ATTRIBS
-void
-skc_kernel_rasterize_all(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
-                         __global union skc_bp_elem                * const bp_elems,
-                         __global uint                             * const bp_ids,
-                         skc_uint                                    const bp_mask,
-
-                         __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
-                         __global skc_ttsk_s_t                     * const sk_extent,
-
-                         __global float8                  const    * const transforms, // FIXME -- __constant
-                         __global float4                  const    * const clips,      // FIXME -- __constant
-                         __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
-                         skc_uint                                    const count)
-{
-  //
-  // declare shared memory block
-  //
-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
-  __local struct skc_subgroup_smem volatile                smem[1];
-#else
-  __local struct skc_subgroup_smem volatile                smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
-  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
-#endif
-  
-  //
-  // this is a subgroup/warp-centric kernel
-  //
-  // which subgroup in the grid is this?
-  //
-  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
-  // get_group_id(0) as a uniform but the alternative calculation used
-  // when there are multiple subgroups per workgroup is not
-  // cooperating and driving spillage elsewhere.
-  //
-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
-  uint const cmd_idx = get_group_id(0);
-#else
-  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
-#endif
-
-#if 0
-  if (get_sub_group_local_id() == 0)
-    printf("+cmd_idx = %u\n",cmd_idx);
-#endif
-
-  //
-  // if worksgroups are multi-subgroup then there may be excess
-  // subgroups in the final workgroup
-  //
-  if (cmd_idx >= count)
-    return;
-
-#if 0
-  if (get_sub_group_local_id() == 0)
-    printf("-cmd_idx = %u\n",cmd_idx);
-#endif
-
-  //
-  // load a single command for this subgroup
-  //
-  union skc_cmd_rasterize const cmd = cmds[cmd_idx];
-
-#if 0
-  if (get_sub_group_local_id() == 0)
-    printf("[ %u ]< %u, %u, %u, %u >\n",
-           cmd_idx,
-           cmd.nodeword,
-           SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd),
-           SKC_CMD_RASTERIZE_GET_CLIP(cmd),
-           SKC_CMD_RASTERIZE_GET_COHORT(cmd));
-#endif
-
-  //
-  // get first block node command word and its subblock
-  //
-  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
-  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
-  skc_block_id_tag      tag      = SKC_TAGGED_BLOCK_ID_GET_TAG(tag_id);
-  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
-
-  //
-  // load transform -- uniform across subgroup
-  //
-  // v8: { sx shx tx shy sy ty w0 w1 }
-  //
-  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
-  //
-  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
-  //
-  // Coordinates are scaled to subpixel resolution.  All that matters
-  // is that continuity is maintained between end path element
-  // endpoints.
-  //
-  // It's the responsibility of the host to ensure that the transforms
-  // are properly scaled either via intitializing a transform stack
-  // with the subpixel resolution scaled identity or scaling the
-  // transform before its loaded by a rasterization grid.
-  //
-  // FIXME -- horizontal load might be better than this broadcast load
-  //
-  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
-  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
-  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
-
-  switch (tag)
-    {
-    case SKC_BLOCK_ID_TAG_PATH_LINE:
-      skc_rasterize_lines(bp_atomics,
-                          bp_elems,
-                          bp_ids,
-                          bp_mask,
-                          cohort_atomics,
-                          sk_extent,
-                          smem,
-                          &nodeword,&id,
-                          &tv,&cv,cohort);
-      break;
-
-    case SKC_BLOCK_ID_TAG_PATH_QUAD:
-      skc_rasterize_quads(bp_atomics,
-                          bp_elems,
-                          bp_ids,
-                          bp_mask,
-                          cohort_atomics,
-                          sk_extent,
-                          smem,
-                          &nodeword,&id,
-                          &tv,&cv,cohort);
-      break;
-
-    case SKC_BLOCK_ID_TAG_PATH_CUBIC:
-      skc_rasterize_cubics(bp_atomics,
-                           bp_elems,
-                           bp_ids,
-                           bp_mask,
-                           cohort_atomics,
-                           sk_extent,
-                           smem,
-                           &nodeword,&id,
-                           &tv,&cv,cohort);
-      break;
-
-    case SKC_BLOCK_ID_TAG_PATH_RAT_QUAD:
-      break;
-    case SKC_BLOCK_ID_TAG_PATH_RAT_CUBIC:
-      break;
-
-    default:
-      break;
-    }
-}
-
-//
-//
-//
-
-__kernel
-SKC_RASTERIZE_KERNEL_ATTRIBS
-void
-skc_kernel_rasterize_lines(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
-                           __global union skc_bp_elem                * const bp_elems,
-                           __global uint                             * const bp_ids,
-                           skc_uint                                    const bp_mask,
-
-                           __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
-                           __global skc_ttsk_s_t                     * const sk_extent,
-
-                           __global float8                  const    * const transforms, // FIXME -- __constant
-                           __global float4                  const    * const clips,      // FIXME -- __constant
-                           __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
-                           skc_uint                                    const count)
-{
-  //
-  // declare shared memory block
-  //
-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
-  __local struct skc_subgroup_smem volatile                smem[1];
-#else
-  __local struct skc_subgroup_smem volatile                smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
-  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
-#endif
-  
-  //
-  // this is a subgroup/warp-centric kernel
-  //
-  // which subgroup in the grid is this?
-  //
-  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
-  // get_group_id(0) as a uniform but the alternative calculation used
-  // when there are multiple subgroups per workgroup is not
-  // cooperating and driving spillage elsewhere.
-  //
-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
-  uint const cmd_idx = get_group_id(0);
-#else
-  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
-#endif
-
-  //
-  // if worksgroups are multi-subgroup then there may be excess
-  // subgroups in the final workgroup
-  //
-  if (cmd_idx >= count)
-    return;
-
-#if 0
-  if (get_sub_group_local_id() == 0)
-    printf("cmd_idx = %u\n",cmd_idx);
-#endif
-
-  //
-  // load a single command for this subgroup
-  //
-  union skc_cmd_rasterize const cmd = cmds[cmd_idx];
-
-  //
-  // get first block node command word and its subblock
-  //
-  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
-  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
-  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
-
-  //
-  // load transform -- uniform across subgroup
-  //
-  // v8: { sx shx tx shy sy ty w0 w1 }
-  //
-  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
-  //
-  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
-  //
-  // Coordinates are scaled to subpixel resolution.  All that matters
-  // is that continuity is maintained between end path element
-  // endpoints.
-  //
-  // It's the responsibility of the host to ensure that the transforms
-  // are properly scaled either via intitializing a transform stack
-  // with the subpixel resolution scaled identity or scaling the
-  // transform before its loaded by a rasterization grid.
-  //
-  // FIXME -- horizontal load might be better than this broadcast load
-  //
-  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
-  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
-  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
-
-  skc_rasterize_lines(bp_atomics,
-                      bp_elems,
-                      bp_ids,
-                      bp_mask,
-                      cohort_atomics,
-                      sk_extent,
-                      smem,
-                      &nodeword,&id,
-                      &tv,&cv,cohort);
-}
-
-//
-//
-//
-
-//
-//
-//
-
-__kernel
-SKC_RASTERIZE_KERNEL_ATTRIBS
-void
-skc_kernel_rasterize_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
-                           __global union skc_bp_elem                * const bp_elems,
-                           __global uint                             * const bp_ids,
-                           skc_uint                                    const bp_mask,
-
-                           __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
-                           __global skc_ttsk_s_t                     * const sk_extent,
-
-                           __global float8                  const    * const transforms, // FIXME -- __constant
-                           __global float4                  const    * const clips,      // FIXME -- __constant
-                           __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
-                           skc_uint                                    const count)
-{
-  //
-  // declare shared memory block
-  //
-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
-  __local struct skc_subgroup_smem volatile                smem[1];
-#else
-  __local struct skc_subgroup_smem volatile                smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
-  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
-#endif
-  
-  //
-  // this is a subgroup/warp-centric kernel
-  //
-  // which subgroup in the grid is this?
-  //
-  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
-  // get_group_id(0) as a uniform but the alternative calculation used
-  // when there are multiple subgroups per workgroup is not
-  // cooperating and driving spillage elsewhere.
-  //
-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
-  uint const cmd_idx = get_group_id(0);
-#else
-  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
-#endif
-
-  //
-  // if worksgroups are multi-subgroup then there may be excess
-  // subgroups in the final workgroup
-  //
-  if (cmd_idx >= count)
-    return;
-
-#if 0
-  if (get_sub_group_local_id() == 0)
-    printf("cmd_idx = %u\n",cmd_idx);
-#endif
-
-  //
-  // load a single command for this subgroup
-  //
-  union skc_cmd_rasterize const cmd = cmds[cmd_idx];
-
-  //
-  // get first block node command word and its subblock
-  //
-  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
-  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
-  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
-
-  //
-  // load transform -- uniform across subgroup
-  //
-  // v8: { sx shx tx shy sy ty w0 w1 }
-  //
-  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
-  //
-  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
-  //
-  // Coordinates are scaled to subpixel resolution.  All that matters
-  // is that continuity is maintained between end path element
-  // endpoints.
-  //
-  // It's the responsibility of the host to ensure that the transforms
-  // are properly scaled either via intitializing a transform stack
-  // with the subpixel resolution scaled identity or scaling the
-  // transform before its loaded by a rasterization grid.
-  //
-  // FIXME -- horizontal load might be better than this broadcast load
-  //
-  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
-  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
-  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
-
-  skc_rasterize_quads(bp_atomics,
-                      bp_elems,
-                      bp_ids,
-                      bp_mask,
-                      cohort_atomics,
-                      sk_extent,
-                      smem,
-                      &nodeword,&id,
-                      &tv,&cv,cohort);
-}
-
-//
-//
-//
-
-__kernel
-SKC_RASTERIZE_KERNEL_ATTRIBS
-void
-skc_kernel_rasterize_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
-                            __global union skc_bp_elem                * const bp_elems,
-                            __global uint                             * const bp_ids,
-                            skc_uint                                    const bp_mask,
-
-                            __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
-                            __global skc_ttsk_s_t                     * const sk_extent,
-
-                            __global float8                  const    * const transforms, // FIXME -- __constant
-                            __global float4                  const    * const clips,      // FIXME -- __constant
-                            __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
-                            skc_uint                                    const count)
-{
-  //
-  // declare shared memory block
-  //
-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
-  __local struct skc_subgroup_smem volatile                smem[1];
-#else
-  __local struct skc_subgroup_smem volatile                smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
-  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
-#endif
-  
-  //
-  // this is a subgroup/warp-centric kernel
-  //
-  // which subgroup in the grid is this?
-  //
-  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
-  // get_group_id(0) as a uniform but the alternative calculation used
-  // when there are multiple subgroups per workgroup is not
-  // cooperating and driving spillage elsewhere.
-  //
-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
-  uint const cmd_idx = get_group_id(0);
-#else
-  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
-#endif
-
-  //
-  // if worksgroups are multi-subgroup then there may be excess
-  // subgroups in the final workgroup
-  //
-  if (cmd_idx >= count)
-    return;
-
-#if 0
-  if (get_sub_group_local_id() == 0)
-    printf("cmd_idx = %u\n",cmd_idx);
-#endif
-
-  //
-  // load a single command for this subgroup
-  //
-  union skc_cmd_rasterize const cmd = cmds[cmd_idx];
-
-  //
-  // get first block node command word and its subblock
-  //
-  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
-  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
-  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
-
-  //
-  // load transform -- uniform across subgroup
-  //
-  // v8: { sx shx tx shy sy ty w0 w1 }
-  //
-  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
-  //
-  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
-  //
-  // Coordinates are scaled to subpixel resolution.  All that matters
-  // is that continuity is maintained between end path element
-  // endpoints.
-  //
-  // It's the responsibility of the host to ensure that the transforms
-  // are properly scaled either via intitializing a transform stack
-  // with the subpixel resolution scaled identity or scaling the
-  // transform before its loaded by a rasterization grid.
-  //
-  // FIXME -- horizontal load might be better than this broadcast load
-  //
-  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
-  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
-  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
-
-  skc_rasterize_cubics(bp_atomics,
-                       bp_elems,
-                       bp_ids,
-                       bp_mask,
-                       cohort_atomics,
-                       sk_extent,
-                       smem,
-                       &nodeword,&id,
-                       &tv,&cv,cohort);
-}
-
-//
-//
-//
-
-__kernel
-SKC_RASTERIZE_KERNEL_ATTRIBS
-void
-skc_kernel_rasterize_rat_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
-                               __global union skc_bp_elem                * const bp_elems,
-                               __global uint                             * const bp_ids,
-                               skc_uint                                    const bp_mask,
-
-                               __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
-                               __global skc_ttsk_s_t                     * const sk_extent,
-
-                               __global float8                  const    * const transforms, // FIXME -- __constant
-                               __global float4                  const    * const clips,      // FIXME -- __constant
-                               __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
-                               skc_uint                                    const count)
-{
-  ;
-}
-
-//
-//
-//
-
-__kernel
-SKC_RASTERIZE_KERNEL_ATTRIBS
-void
-skc_kernel_rasterize_rat_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
-                                __global union skc_bp_elem                * const bp_elems,
-                                __global uint                             * const bp_ids,
-                                skc_uint                                    const bp_mask,
-
-                                __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
-                                __global skc_ttsk_s_t                     * const sk_extent,
-
-                                __global float8                  const    * const transforms, // FIXME -- __constant
-                                __global float4                  const    * const clips,      // FIXME -- __constant
-                                __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
-                                skc_uint                                    const count)
-{
-  ;
-}
-
-//
-//
-//
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "tile.h"
+#include "common.h"
+#include "atomic_cl.h"
+#include "block_pool_cl.h"
+#include "raster_builder_cl_12.h"
+#include "kernel_cl_12.h"
+
+// #define SKC_ARCH_AVX2
+// #define SKC_RASTERIZE_SIMD_USES_SMEM
+
+#define PRINTF_ENABLE       0
+#define PRINTF_BLOCK_COUNT  0
+
+//
+// NOTE:
+//
+// ON SIMD DEVICES THE BIN COUNT MUST BE POW2 SO THAT WE CAN LOAD IT
+// AS A VECTOR AND PERFORM A SWIZZLE/SHUFFLE
+//
+// NOTE:
+//
+// IGNORE FOR NOW ANY AVX2 CODE SNIPPETS.  THEY WILL BE MOVED ASAP.
+//
+//
+
+#if 0 // SKC_ARCH_AVX2
+
+// #define SKC_RASTERIZE_SUBGROUP_SIZE              1
+// #define SKC_RASTERIZE_VECTOR_SIZE_LOG2           3
+// #define SKC_RASTERIZE_WORKGROUP_COUNT_SUBGROUP   1
+
+// #define SKC_TTXB_WORDS                           8
+
+// #define SKC_RASTERIZE_FLOAT                      float8
+// #define SKC_RASTERIZE_UINT                       uint8
+// #define SKC_RASTERIZE_INT                        int8
+// #define SKC_RASTERIZE_PREDICATE                  int8
+
+// #define SKC_RASTERIZE_BIN_BLOCK                  uint16
+// #define SKC_RASTERIZE_BIN                        uint8
+
+// #define SKC_RASTERIZE_POOL                       uint8
+// #define SKC_RASTERIZE_POOL_SCALE                 6
+
+// #define SKC_RASTERIZE_TILE_HASH_X_BITS           1
+// #define SKC_RASTERIZE_TILE_HASH_Y_BITS           2
+
+// #define SKC_RASTERIZE_VECTOR_EXPAND()            SKC_EXPAND_8()
+
+#endif
+
+//
+// SIMT
+//
+
+#define SKC_RASTERIZE_BLOCK_ID_V_SIZE        SKC_RASTERIZE_SUBGROUP_SIZE
+#define SKC_RASTERIZE_TTSK_V_SIZE            SKC_RASTERIZE_SUBGROUP_SIZE
+#define SKC_RASTERIZE_TTSK_V_MASK            (SKC_RASTERIZE_TTSK_V_SIZE - 1)
+
+//
+//
+//
+
+#define SKC_RASTERIZE_VECTOR_SIZE            (1 << SKC_RASTERIZE_VECTOR_SIZE_LOG2)
+#define SKC_RASTERIZE_ELEMS_PER_SUBGROUP     (SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_VECTOR_SIZE)
+
+//
+//
+//
+
+#define SKC_RASTERIZE_YX_INIT                0x7FFF7FFF  // { +32767, +32767 }
+#define SKC_RASTERIZE_YX_INVALID             0x80008000  // { -32768, -32768 }
+
+//
+//
+//
+
+#define SKC_RASTERIZE_TILE_HASH_X_MASK       SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_X_BITS)
+#define SKC_RASTERIZE_TILE_HASH_Y_MASK       SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_Y_BITS)
+#define SKC_RASTERIZE_TILE_HASH_BITS         (SKC_RASTERIZE_TILE_HASH_X_BITS + SKC_RASTERIZE_TILE_HASH_Y_BITS)
+#define SKC_RASTERIZE_TILE_HASH_BIN_COUNT    (1 << SKC_RASTERIZE_TILE_HASH_BITS)
+#define SKC_RASTERIZE_TILE_HASH_BIN_BITS     (SKC_RASTERIZE_TILE_HASH_BITS + 1) // FIXME -- LOG2_RU(BIN_COUNT)
+#define SKC_RASTERIZE_TILE_HASH_BIN_MASK     SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_BIN_BITS)
+
+//
+// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"
+//
+// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/
+//
+// Lerp in two fma/mad ops:
+//
+//    t * b + ((-t) * a + a)
+//
+// Note: OpenCL documents mix() as being implemented as:
+//
+//    a + (b - a) * t
+//
+// But this may be a native instruction on some devices. For example,
+// on GEN9 there is an LRP "linear interoplation" opcode but it
+// doesn't appear to support half floats.
+//
+// Feel free to toggle this option and then benchmark and inspect the
+// generated code.  We really want the double FMA to be generated when
+// there isn't support for a LERP/MIX operation.
+//
+
+#if 1
+#define SKC_LERP(a,b,t)      mad(t,b,mad(-(t),a,a))
+#else
+#define SKC_LERP(a,b,t)      mix(a,b,t)
+#endif
+
+//
+// There is no integer MAD in OpenCL with "don't care" overflow
+// semantics.
+//
+// FIXME -- verify if the platform needs explicit MAD operations even
+// if a "--fastmath" option is available at compile time.  It might
+// make sense to explicitly use MAD calls if the platform requires it.
+//
+
+#if 1
+#define SKC_MAD_UINT(a,b,c)  ((a) * (b) + (c))
+#else
+#define SKC_MAD_UINT(a,b,c)  mad_sat(a,b,c)
+#endif
+
+//
+//
+//
+
+#define SKC_RASTERIZE_SEGMENT(id) (id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane())
+
+//
+//
+//
+
+union skc_bp_elem
+{
+  skc_uint              u32;
+  skc_tagged_block_id_t tag_id;
+  skc_float             coord;
+};
+
+//
+//
+//
+
+struct skc_subgroup_smem
+{
+  //
+  // SIMT subgroup scratchpad for max scan -- also shared with 'winner' member
+  //
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) || defined ( SKC_RASTERIZE_SIMD_USES_SMEM )
+  struct {
+    union {
+
+      skc_uint                winner;
+
+      struct {
+        skc_uint              scratch[SKC_RASTERIZE_SUBGROUP_SIZE];
+      } aN;
+
+      struct {
+        SKC_RASTERIZE_UINT    scratch[SKC_RASTERIZE_SUBGROUP_SIZE];
+      } vN;
+    };
+  } subgroup;
+#endif
+
+  //
+  // work-in-progress TTSB blocks and associated YX keys
+  //
+  union {
+    struct {
+      // FIXME -- some typedefs are valid here
+      skc_uint                ttsb [SKC_RASTERIZE_TILE_HASH_BIN_COUNT][SKC_DEVICE_SUBBLOCK_WORDS];
+      skc_uint                yx   [SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
+      skc_uint                id   [SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
+      skc_uint                count[SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
+    } aN;
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+    struct {
+      SKC_RASTERIZE_BIN_BLOCK ttsb[SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
+      SKC_RASTERIZE_BIN       yx;
+      SKC_RASTERIZE_BIN       id;
+      SKC_RASTERIZE_BIN       count;
+    } vN;
+#endif
+  } bin;
+};
+
+//
+//
+//
+
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+#define skc_subgroup_lane()  0
+#else
+#define skc_subgroup_lane()  get_sub_group_local_id()
+#endif
+
+//
+// replenish block ids
+//
+// note that you can't overrun the block id pool since it's a ring
+//
+
+static
+void
+skc_blocks_replenish(skc_uint                           * const blocks_next,
+                     skc_block_id_v_t                   * const blocks,
+                     __global SKC_ATOMIC_UINT  volatile * const bp_atomics,
+                     skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring
+                     __global skc_block_id_t   const    * const bp_ids)
+{
+  //
+  // get a new vector of block ids -- this is kind of a narrow
+  // allocation but subblocks help stretch out the pool.
+  //
+  // FIXME -- there is now plenty of SMEM to allocate a LOT of block ids
+  //
+  skc_uint bp_idx = 0;
+
+  if (skc_subgroup_lane() == 0)
+    {
+      bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,
+                                                    SKC_RASTERIZE_BLOCK_ID_V_SIZE); // ring_reads
+#if 0
+      printf("r+: %8u + %u\n",bp_idx,SKC_RASTERIZE_BLOCK_ID_V_SIZE);
+#endif
+    }
+
+  bp_idx       = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane()) & bp_mask;
+  *blocks      = bp_ids[bp_idx];
+  *blocks_next = 0;
+}
+
+//
+//
+//
+
+static
+skc_block_id_t
+skc_blocks_get_next(skc_uint                           * const blocks_next,
+                    skc_block_id_v_t                   * const blocks,
+                    __global SKC_ATOMIC_UINT  volatile * const bp_atomics,
+                    skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring
+                    __global skc_block_id_t   const    * const bp_ids)
+{
+  // replenish?
+  if (*blocks_next == SKC_RASTERIZE_BLOCK_ID_V_SIZE)
+    {
+      skc_blocks_replenish(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);
+    }
+
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 )
+  //
+  // SIMT
+  //
+  skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next);
+
+#else
+  //
+  // SIMD
+  //
+  skc_block_id_t id = blocks->s0;
+
+  skc_shuffle_down_1(*blocks);
+
+#endif
+
+  *blocks_next += 1;
+
+  return id;
+}
+
+//
+// subblock allocator
+//
+
+#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
+
+static
+skc_block_id_t
+skc_subblocks_get_next(skc_block_id_t                     * const subblocks,
+                       skc_uint                           * const blocks_next,
+                       skc_block_id_v_t                   * const blocks,
+                       __global SKC_ATOMIC_UINT  volatile * const bp_atomics,
+                       skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring
+                       __global skc_block_id_t   const    * const bp_ids)
+{
+  if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
+    {
+      *subblocks = skc_blocks_get_next(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);
+    }
+
+  skc_block_id_t const sb_id = *subblocks;
+
+  *subblocks += 1;
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("= %u\n",sb_id);
+#endif
+
+  return sb_id;
+}
+
+
+#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const subblocks, skc_block_id_t * const blocks
+#define SKC_SUBBLOCKS_BLOCKS_ARGS()  subblocks, blocks
+
+#else
+
+#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const blocks
+#define SKC_SUBBLOCKS_BLOCKS_ARGS()  blocks
+
+#endif
+
+//
+//
+//
+
+static
+skc_block_id_t
+skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_PROTO(),
+                  skc_uint                           * const blocks_next,
+                  __global SKC_ATOMIC_UINT  volatile * const bp_atomics,
+                  skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring
+                  __global skc_block_id_t   const    * const bp_ids,
+                  __global SKC_ATOMIC_UINT  volatile * const cohort_atomics,
+                  skc_ttsk_v_t                       * const sk_v,
+                  skc_uint                           * const sk_v_next,
+                  __global skc_ttsk_s_t              * const sk_extent,
+                  skc_uint                             const new_yx)
+{
+#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
+  skc_block_id_t const new_id = skc_subblocks_get_next(subblocks,
+                                                       blocks_next,
+                                                       blocks,
+                                                       bp_atomics,
+                                                       bp_mask,
+                                                       bp_ids);
+#else
+  skc_block_id_t const new_id = skc_blocks_get_next(blocks_next,
+                                                    blocks,
+                                                    bp_atomics,
+                                                    bp_mask, // pow2 modulo mask for block pool ring
+                                                    bp_ids);
+#endif
+
+  if (get_sub_group_local_id() == (*sk_v_next & SKC_RASTERIZE_TTSK_V_MASK))
+    {
+      sk_v->lo = new_id;
+      sk_v->hi = (sk_v->hi & SKC_TTRK_HI_MASK_COHORT) | new_yx;
+#if 0
+      printf("@ ( %3u, %3u ) %u\n",
+             (new_yx >> 12) & 0xFFF,
+             (new_yx      ) & 0xFFF,
+             new_id);
+#endif
+    }
+
+  *sk_v_next += 1;
+
+  if (*sk_v_next == SKC_RASTERIZE_TTSK_V_SIZE)
+    {
+      *sk_v_next = 0;
+
+      skc_uint sk_idx = 0;
+
+      if (skc_subgroup_lane() == 0)
+        {
+          sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
+            (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_TTSK_V_SIZE);
+#if 0
+          printf("+ %u\n",sk_idx);
+#endif
+        }
+
+      sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();
+
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE > SKC_RASTERIZE_TTSK_V_SIZE )
+      if (skc_subgroup_lane() < SKC_RASTERIZE_TTSK_V_SIZE)
+#endif
+        {
+          sk_extent[sk_idx] = *sk_v;
+#if 0
+          printf("> %u : %v2u\n",sk_idx,*sk_v);
+#endif
+        }
+    }
+
+  return new_id;
+}
+
+//
+//
+//
+
+static
+SKC_RASTERIZE_FLOAT
+skc_subgroup_scan_inclusive_add_float(SKC_RASTERIZE_FLOAT const v)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+  // Note that there isn't a built-in horizontal scan for vectors so
+  // we'll define some here for various widths.
+  //
+  // FIXME -- a scalar version might be faster so put in a
+  // compile-time switch to selection between implementations
+  //
+
+#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return v;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+  // 01
+  //  0 +
+  // --
+  // 01
+  SKC_RASTERIZE_FLOAT const w = mad(v.s10,(SKC_RASTERIZE_FLOAT)(0,1),v);
+  return w;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+  // 0123
+  //  012 +
+  // ----
+  // 0123
+  //   01 +
+  // ----
+  // 0123
+  //
+  SKC_RASTERIZE_FLOAT const w = mad(v.s3012,(SKC_RASTERIZE_FLOAT)(0,1,1,1),v);
+  SKC_RASTERIZE_FLOAT const x = mad(w.s2301,(SKC_RASTERIZE_FLOAT)(0,0,1,1),w);
+  return x;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+  // 01234567
+  //  0123456 +
+  // --------
+  // 01234567
+  //   012345 +
+  // --------
+  // 01234567
+  //     0123 +
+  // --------
+  // 01234567
+  //
+  SKC_RASTERIZE_FLOAT const w = mad(v.s70123456,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1),v);
+  SKC_RASTERIZE_FLOAT const x = mad(w.s67012345,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1),w);
+  SKC_RASTERIZE_FLOAT const y = mad(x.s45670123,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1),x);
+  return y;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+  // 0123456789abcdef
+  //  0123456789abcde +
+  // ----------------
+  // 0123456789abcdef
+  //   0123456789abcd +
+  // ----------------
+  // 0123456789abcdef
+  //     0123456789ab +
+  // ----------------
+  // 0123456789abcdef
+  //         01234567 +
+  // ----------------
+  // 0123456789abcdef
+  //
+  SKC_RASTERIZE_FLOAT const w = mad(v.sf0123456789abcde,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);
+  SKC_RASTERIZE_FLOAT const x = mad(w.sef0123456789abcd,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);
+  SKC_RASTERIZE_FLOAT const y = mad(x.scdef0123456789ab,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);
+  SKC_RASTERIZE_FLOAT const z = mad(y.s89abcdef01234567,(SKC_RASTERIZE_FLOAT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);
+  return z;
+
+#endif
+
+#else
+  //
+  // SIMT
+  //
+
+  return sub_group_scan_inclusive_add(v);
+
+#endif
+}
+
+//
+//
+//
+
+static
+SKC_RASTERIZE_UINT
+skc_subgroup_scan_inclusive_add_uint(SKC_RASTERIZE_UINT const v)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+  // Note that there isn't a built-in horizontal scan for vectors so
+  // we'll define some here for various widths.
+  //
+  // FIXME -- a scalar version might be faster so put in a
+  // compile-time switch to selection between implementations
+  //
+
+#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return v;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+  // 01
+  //  0 +
+  // --
+  // 01
+  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s10,(SKC_RASTERIZE_UINT)(0,1),v);
+  return w;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+  // 0123
+  //  012 +
+  // ----
+  // 0123
+  //   01 +
+  // ----
+  // 0123
+  //
+  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s3012,(SKC_RASTERIZE_UINT)(0,1,1,1),v);
+  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s2301,(SKC_RASTERIZE_UINT)(0,0,1,1),w);
+  return x;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+  // 01234567
+  //  0123456 +
+  // --------
+  // 01234567
+  //   012345 +
+  // --------
+  // 01234567
+  //     0123 +
+  // --------
+  // 01234567
+  //
+  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s70123456,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1),v);
+  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s67012345,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1),w);
+  SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.s45670123,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1),x);
+  return y;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+  // 0123456789abcdef
+  //  0123456789abcde +
+  // ----------------
+  // 0123456789abcdef
+  //   0123456789abcd +
+  // ----------------
+  // 0123456789abcdef
+  //     0123456789ab +
+  // ----------------
+  // 0123456789abcdef
+  //         01234567 +
+  // ----------------
+  // 0123456789abcdef
+  //
+  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.sf0123456789abcde,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);
+  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.sef0123456789abcd,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);
+  SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.scdef0123456789ab,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);
+  SKC_RASTERIZE_UINT const z = SKC_MAD_UINT(y.s89abcdef01234567,(SKC_RASTERIZE_UINT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);
+  return z;
+
+#endif
+
+#else
+  //
+  // SIMT
+  //
+
+  return sub_group_scan_inclusive_add(v);
+
+#endif
+}
+
+//
+//
+//
+
+static
+SKC_RASTERIZE_UINT
+skc_subgroup_scan_inclusive_max(SKC_RASTERIZE_UINT const v)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+  // Note that there isn't a built-in horizontal scan for vectors so
+  // we'll define some here for various widths.
+  //
+  // FIXME -- a scalar version might be faster so put in a
+  // compile-time switch to selection between implementations
+  //
+
+#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return v;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+  // 01
+  // 00 max
+  // --
+  // 01
+  SKC_RASTERIZE_UINT const w = max(v.s00,v);
+  return w;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+  // 0123
+  // 0012 +
+  // ----
+  // 0123
+  // 0101 +
+  // ----
+  // 0123
+  //
+  SKC_RASTERIZE_UINT const w = max(v.s0012,v);
+  SKC_RASTERIZE_UINT const x = max(w.s0101,w);
+  return x;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+  // 01234567
+  // 00123456 +
+  // --------
+  // 01234567
+  // 01012345 +
+  // --------
+  // 01234567
+  // 01230123 +
+  // --------
+  // 01234567
+  //
+  SKC_RASTERIZE_UINT const w = max(v.s00123456,v);
+  SKC_RASTERIZE_UINT const x = max(w.s01012345,w);
+  SKC_RASTERIZE_UINT const y = max(x.s01230123,x);
+  return y;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+  // 0123456789abcdef
+  // 00123456789abcde +
+  // ----------------
+  // 0123456789abcdef
+  // 010123456789abcd +
+  // ----------------
+  // 0123456789abcdef
+  // 01230123456789ab +
+  // ----------------
+  // 0123456789abcdef
+  // 0123456701234567 +
+  // ----------------
+  // 0123456789abcdef
+  //
+  SKC_RASTERIZE_UINT const w = max(v.s00123456789abcde,v);
+  SKC_RASTERIZE_UINT const x = max(w.s010123456789abcd,w);
+  SKC_RASTERIZE_UINT const y = max(x.s01230123456789ab,x);
+  SKC_RASTERIZE_UINT const z = max(y.s0123456701234567,y);
+  return z;
+
+#endif
+
+#else
+  //
+  // SIMT
+  //
+
+  return sub_group_scan_inclusive_max(v);
+
+#endif
+}
+
+//
+//
+//
+
+static
+float
+skc_subgroup_last_float(SKC_RASTERIZE_FLOAT const v)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return v;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+  return v.s1;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+  return v.s3;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+  return v.s7;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+  return v.sf;
+#endif
+
+#else
+  //
+  // SIMT
+  //
+  return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);
+
+#endif
+}
+
+//
+//
+//
+
+static
+SKC_RASTERIZE_UINT
+skc_subgroup_last_uint(SKC_RASTERIZE_UINT const v)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return v;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+  return v.s1;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+  return v.s3;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+  return v.s7;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+  return v.sf;
+#endif
+
+#else
+  //
+  // SIMT
+  //
+  return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);
+
+#endif
+}
+
+//
+//
+//
+
+static
+float
+skc_subgroup_first(SKC_RASTERIZE_FLOAT const v)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return v;
+#else
+  return v.s0;
+#endif
+
+#else
+  //
+  // SIMT
+  //
+  return sub_group_broadcast(v,0);
+
+#endif
+}
+
+//
+//
+//
+
+static
+SKC_RASTERIZE_FLOAT
+skc_subgroup_shuffle(SKC_RASTERIZE_FLOAT const v,
+                      SKC_RASTERIZE_UINT  const i)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return v;
+#else
+  return shuffle(v,i);
+#endif
+
+#else
+  //
+  // SIMT
+  //
+  return intel_sub_group_shuffle(v,i);
+
+#endif
+}
+
+//
+//
+//
+
+static
+SKC_RASTERIZE_FLOAT
+skc_subgroup_shuffle_up_1(SKC_RASTERIZE_FLOAT const p, // previous
+                          SKC_RASTERIZE_FLOAT const c) // current
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+  // FIXME -- there are alternative formulations here:
+  //
+  // Option 1:
+  //
+  //   select(c.rotate(+1),p.rotate(-1),(1,0,0,...))
+  //
+  // Option 2:
+  //
+  //   p is a scalar
+  //   t    = c.rotate(+1)
+  //   t.s0 = p;
+  //
+  // Option 3: ...
+  //
+#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return p;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+  return shuffle2(p,c,(uint2)(1,2));
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+  return shuffle2(p,c,(uint4)(3,4,5,6));
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+  return shuffle2(p,c,(uint8)(7,8,9,10,11,12,13,14));
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+  return shuffle2(p,c,(uint16)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30));
+#endif
+
+#else
+  //
+  // SIMT
+  //
+  return intel_sub_group_shuffle_up(p,c,1);
+
+#endif
+}
+
+//
+//
+//
+
+static
+bool
+skc_is_lane_first()
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1)
+  //
+  // SIMD
+  //
+  return true;
+#else
+  //
+  // SIMT
+  //
+  return get_sub_group_local_id() == 0;
+#endif
+}
+
+//
+//
+//
+
+static
+SKC_RASTERIZE_FLOAT
+skc_delta_offset()
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return 1;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+  return (SKC_RASTERIZE_FLOAT)( 1, 2 );
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4 );
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8 );
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 );
+#endif
+
+#else
+  //
+  // SIMT
+  //
+  return 1.0f + get_sub_group_local_id();
+
+#endif
+
+}
+
+//
+//
+//
+
+static
+int
+skc_subgroup_any(SKC_RASTERIZE_PREDICATE const p)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+  return any(p);
+#else
+  //
+  // SIMT
+  //
+  return sub_group_any(p);
+#endif
+}
+
+//
+//
+//
+
+#define SKC_PATH_NODEWORD_IS_LAST(n)  (((n) & SKC_DEVICE_BLOCK_WORDS_MASK) == SKC_DEVICE_BLOCK_WORDS_MASK)
+
+void
+skc_segment_next(__global union skc_bp_elem * const bp_elems,
+                 skc_uint                   * const nodeword,
+                 skc_block_id_t             * const id)
+{
+  if ((++*id & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
+    {
+      if (SKC_PATH_NODEWORD_IS_LAST(++*nodeword))
+        {
+          *nodeword = SKC_TAGGED_BLOCK_ID_GET_ID(bp_elems[*nodeword].tag_id) * SKC_DEVICE_SUBBLOCK_WORDS;
+        }
+
+      skc_tagged_block_id_t const tag_id = bp_elems[*nodeword].tag_id;
+
+      *id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
+    }
+}
+
+//
+//
+//
+
+static
+SKC_RASTERIZE_FLOAT
+skc_native_length(SKC_RASTERIZE_FLOAT const x, SKC_RASTERIZE_FLOAT const y)
+{
+  return native_sqrt(x * x + y * y);
+}
+
+//
+// Wang's Formula (1985)
+//
+
+#define SKC_WANG_PIXEL_RESL   0.25f // <-- this can be tuned
+
+#define SKC_WANG_EPSILON      (SKC_WANG_PIXEL_RESL * SKC_SUBPIXEL_RESL_X_F32)
+
+#define SKC_WANG_CUBIC        ((3.0f * 2.0f) / (8.0f * SKC_WANG_EPSILON))
+#define SKC_WANG_QUADRATIC    ((2.0f       ) / (8.0f * SKC_WANG_EPSILON))
+
+#define SKC_WANG_LENGTH(x,y)  skc_native_length(x,y)
+#define SKC_WANG_SQRT(x)      native_sqrt(x)
+
+//
+//
+//
+
+static
+SKC_RASTERIZE_FLOAT
+skc_wangs_formula_cubic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,
+                        SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,
+                        SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y,
+                        SKC_RASTERIZE_FLOAT const t3x, SKC_RASTERIZE_FLOAT const t3y)
+{
+  //
+  // Return the number of evenly spaced (in the parametric sense) line
+  // segments that are guaranteed to be within "epsilon" error of the
+  // curve.
+  //
+  // We're then going to take multiples of the reciprocal of this
+  // number so that the segmentation can be distributed across the
+  // subgroup.
+  //
+  // Note, this can probably be slightly optimized per architecture
+  // but it's probably far from being a hotspot since it's all
+  // straight-line unpredicated code.
+  //
+  // The result is an integer ranging from [1.0,#segments]
+  //
+  // Note that even if all of the control points are coincident, the
+  // max(1.0f) will categorize this as a line of 1 segment.
+  //
+  // This is what we want!  We want to convert cubics to lines as
+  // easily as possible and *then* cull lines that are either
+  // horizontal or zero length.
+  //
+  return max(1.0f,
+             ceil(SKC_WANG_SQRT(SKC_WANG_CUBIC *
+                                SKC_WANG_LENGTH(max(fabs(t2x - 2.0f * t1x + t0x),
+                                                    fabs(t3x - 2.0f * t2x + t1x)),
+                                                max(fabs(t2y - 2.0f * t1y + t0y),
+                                                    fabs(t3y - 2.0f * t2y + t1y))))));
+}
+
+static
+SKC_RASTERIZE_FLOAT
+skc_wangs_formula_quadratic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,
+                            SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,
+                            SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y)
+{
+  return max(1.0f,
+             ceil(SKC_WANG_SQRT(SKC_WANG_QUADRATIC *
+                                SKC_WANG_LENGTH(fabs(t2x - 2.0f * t1x + t0x),
+                                                fabs(t2y - 2.0f * t1y + t0y)))));
+}
+
+//
+// rational curves
+//
+
+static
+SKC_RASTERIZE_FLOAT
+skc_wangs_formula_cubic_rat()
+{
+  return 0.0f;
+}
+
+static
+SKC_RASTERIZE_FLOAT
+skc_wangs_formula_quad_rat()
+{
+  return 0.0f;
+}
+
+//
+// flush any work-in-progress blocks and return unused block ids
+//
+
+static
+void
+skc_finalize(__global SKC_ATOMIC_UINT          volatile * const bp_atomics,
+             __global union skc_bp_elem                 * const bp_elems,
+             __global uint                              * const bp_ids,
+             skc_uint                                     const bp_mask,
+             __global SKC_ATOMIC_UINT          volatile * const cohort_atomics,
+             skc_block_id_v_t                           * const blocks,
+             skc_uint                                     const blocks_next,
+             skc_ttsk_v_t                               * const sk_v,
+             skc_uint                                     const sk_v_next,
+             __global skc_ttsk_s_t                      * const sk_extent,
+             __local  struct skc_subgroup_smem volatile * const smem)
+{
+  //
+  // flush non-empty bins
+  //
+  // FIXME -- accelerate this iteration/search with a subgroup operation
+  //
+  for (skc_uint ii=0; ii<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; ii++)
+    {
+      if (smem->bin.aN.count[ii] > 0)
+        {
+          skc_block_id_v_t const id  = smem->bin.aN.id[ii];
+          skc_uint         const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
+          skc_uint         const tts = smem->bin.aN.ttsb[ii][skc_subgroup_lane()];
+#if 0
+          printf("???????? : [ %10u = %10u : %08X ]\n",id,idx,tts);
+#endif
+          bp_elems[idx].u32 = tts;
+        }
+
+      //
+      // FIXME -- vectorize with vstoreN()
+      //
+    }
+
+  //
+  // return remaining block ids back to the pool
+  //
+  skc_uint const blocks_rem = SKC_RASTERIZE_BLOCK_ID_V_SIZE - blocks_next;
+
+  if (blocks_rem > 0)
+    {
+      skc_uint bp_idx = 0;
+
+      if (skc_subgroup_lane() == 0)
+        {
+          bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,blocks_rem);
+
+#if 0
+          printf("r-: %8u + %u\n",bp_idx,blocks_rem);
+#endif
+        }
+
+      bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane() - blocks_next) & bp_mask;
+
+      if (skc_subgroup_lane() >= blocks_next)
+        {
+          bp_ids[bp_idx] = *blocks;
+        }
+    }
+
+  //
+  // flush work-in-progress ryx keys
+  //
+  if (sk_v_next > 0)
+    {
+      skc_uint sk_idx = 0;
+
+      if (skc_subgroup_lane() == 0)
+        {
+          sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
+            (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,sk_v_next);
+#if 0
+          printf("* %u\n",sk_idx);
+#endif
+        }
+
+      sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();
+
+      if (skc_subgroup_lane() < sk_v_next)
+        {
+          sk_extent[sk_idx] = *sk_v;
+        }
+    }
+}
+
+//
+// If there are lanes that were unable to append to a bin because
+// their hashes collided with a bin's current ryx key then those bins
+// must be ejected.
+//
+// Note that we do not eject "full" bins because lazily waiting for a
+// collision results in simpler code.
+//
+
+static
+void
+skc_flush(__global SKC_ATOMIC_UINT          volatile * const bp_atomics,
+          __global union skc_bp_elem                 * const bp_elems,
+          __global uint                              * const bp_ids,
+          skc_uint                                     const bp_mask,
+          __global SKC_ATOMIC_UINT          volatile * const cohort_atomics,
+          skc_block_id_t                             * const subblocks,
+          skc_block_id_v_t                           * const blocks,
+          skc_uint                                   * const blocks_next,
+          skc_ttsk_v_t                               * const sk_v,
+          skc_uint                                   * const sk_v_next,
+          __global skc_ttsk_s_t                      * const sk_extent,
+          __local  struct skc_subgroup_smem volatile * const smem,
+          SKC_RASTERIZE_UINT                           const hash,
+          SKC_RASTERIZE_UINT                           const yx,
+          SKC_RASTERIZE_PREDICATE                            is_collision) // pass by value
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+
+  //
+  // FIXME -- this code is now stale with the changes to the
+  // subblock/block allocation strategy
+  //
+
+  //
+  // get local TTSB ID queue count
+  //
+  skc_uint ttsb_id_count  = smem->pool.count; // scalar
+
+  // init hash bit mask
+  skc_uint component_mask = 0;
+
+  for (int cc=0; cc<SKC_RASTERIZE_VECTOR_SIZE; cc++)
+    {
+      // if no collision continue
+      if (((int*)&is_collision)[cc] == 0)
+        continue;
+
+      uint const winner        = ((uint*)&hash)[cc];
+      uint const component_bit = 1u << winner;
+
+      // if already processed this hash then continue
+      if (component_mask & component_bit)
+        continue;
+
+      // update component mask
+      component_mask |= component_bit;
+
+      //
+      // new winner requires ejecting the old TTSB
+      //
+      if (smem->bin.aN.count[winner] > 0)
+        {
+          skc_uint const elem_idx = smem->bin.aN.id[winner] * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
+
+          bp_elems[elem_idx].u32 = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];
+        }
+
+        //
+        // ensure there is at least one TTSK and TTSB ID
+        //
+        if (ttsb_id_count == SKC_RASTERIZE_POOL_SIZE)
+          {
+            //
+            // update remaining count
+            //
+            ttsb_id_count = 0;
+
+            //
+            // flush accumulated ttsk_ryx keys
+            //
+            uint const idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
+              (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_POOL_SIZE); // ttsk_ryx_count
+
+#if 0
+            printf("# %u\n",idx);
+#endif
+
+            for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE)
+              {
+                ttsk_ryx[idx + ii] = skc_make_ttsk_ryx(smem,SKC_CMD_RASTERIZE_GET_COHORT(cmd),ii);
+              }
+
+            //
+            // allocate more ttsb ids from pool
+            //
+            uint const id = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+0,SKC_RASTERIZE_POOL_SIZE); // ring_reads
+
+            for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE)
+              smem->pool.aN.id[ii] = bp_ids[id + ii];
+          }
+
+      //
+      // invalidate the winning block
+      //
+
+      //
+      // update bin with winning yx, new ttsb id and zero count
+      //
+      // all lanes are loading/storing from/to the same index
+      //
+      smem->bin.vN.ttsb [winner] = ( SKC_TTS_INVALID );
+      smem->bin.aN.id   [winner] = smem->pool.aN.id[ttsb_id_count];
+      smem->bin.aN.yx   [winner] = smem->pool.aN.yx[ttsb_id_count] = ((uint*)&yx)[cc];
+      smem->bin.aN.count[winner] = 0;
+
+      //
+      // update count
+      //
+      ttsb_id_count += 1;
+    }
+
+  //
+  // save count
+  //
+  smem->pool.count = ttsb_id_count;
+
+#else
+  //
+  // SIMT
+  //
+
+  do {
+    //
+    // only one lane will win!
+    //
+    if (is_collision)
+      smem->subgroup.winner = hash;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //
+    // which bin is being ejected?
+    //
+    skc_uint const winner = smem->subgroup.winner;
+
+    //
+    // which colliding hash is taking over the bin?
+    //
+    SKC_RASTERIZE_PREDICATE const is_winner = is_collision && (hash == winner);
+
+    //
+    // all lanes with the same hash will try to store but only one
+    // lane will win
+    //
+    if (is_winner)
+      smem->subgroup.winner = yx;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //
+    // flush this block to the pool
+    //
+    if (smem->bin.aN.count[winner] > 0)
+      {
+        skc_block_id_v_t const id  = smem->bin.aN.id[winner];
+        skc_uint         const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
+        skc_uint         const tts = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];
+#if 0
+        printf("%08X : [ %10u = %10u : %08X ]\n",yx,id,idx,tts);
+#endif
+        bp_elems[idx].u32 = tts;
+      }
+
+    //
+    // append new ttsk
+    //
+    skc_uint       const new_yx = smem->subgroup.winner;
+    skc_block_id_t const new_id = skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_ARGS(),
+                                                    blocks_next,
+                                                    bp_atomics,
+                                                    bp_mask, // pow2 modulo mask for block pool ring
+                                                    bp_ids,
+                                                    cohort_atomics,
+                                                    sk_v,
+                                                    sk_v_next,
+                                                    sk_extent,
+                                                    new_yx);
+
+#if 0
+    if (get_sub_group_local_id() == 0) {
+      printf(">>> %9u\n",new_id);
+    }
+#endif
+
+    //
+    // update bin with winning yx, new ttsb id and zero count
+    //
+    smem->bin.aN.ttsb [winner][skc_subgroup_lane()] = SKC_TTS_INVALID;
+    smem->bin.aN.yx   [winner]                      = new_yx;
+    smem->bin.aN.id   [winner]                      = new_id;
+    smem->bin.aN.count[winner]                      = 0;
+
+    //
+    // remove all lanes matching this hash
+    //
+    is_collision = is_collision && !is_winner;
+
+    //
+    // exit if nothing left to do
+    //
+  } while (sub_group_any(is_collision));
+
+#endif
+}
+
+//
+// scatter scan max
+//
+static
+SKC_RASTERIZE_UINT
+skc_scatter_scan_max(__local struct skc_subgroup_smem volatile * const smem,
+                     SKC_RASTERIZE_FLOAT                         const iss,
+                     SKC_RASTERIZE_FLOAT                         const ess)
+{
+  //
+  // prefix sums determine which lanes we're going to work on next
+  //
+  SKC_RASTERIZE_PREDICATE const is_scratch_store = (iss > 0.0f) && (ess < (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP);
+  SKC_RASTERIZE_UINT      const scratch_idx      = SKC_CONVERT(SKC_RASTERIZE_UINT)(max(ess,0.0f));
+
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+#ifdef SKC_RASTERIZE_SIMD_USES_SMEM
+  //
+  // SIMD APPROACH 1: SIMT'ISH
+  //
+
+  // zero the volatile smem scratchpad using vector syntax
+  smem->subgroup.vN.scratch[0] = ( 0 );
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A)                         \
+  if (is_scratch_store C)                               \
+    smem->subgroup.aN.scratch[scratch_idx C] = I;
+
+  SKC_RASTERIZE_VECTOR_EXPAND();
+
+  // propagate lanes to right using max scan
+  SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[0];
+  SKC_RASTERIZE_UINT const source  = skc_subgroup_scan_inclusive_max(scratch);
+
+#else
+  //
+  // SIMD APPROACH 2: SCALAR'ISH
+  //
+
+  SKC_RASTERIZE_UINT source = ( 0 );
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A)                 \
+  if (is_scratch_store C)                       \
+    ((uint *)&source)[scratch_idx C] = I;
+
+  SKC_RASTERIZE_VECTOR_EXPAND();
+
+  for (uint ii=1; ii<SKC_RASTERIZE_ELEMS_PER_SUBGROUP; ii++)
+    ((uint *)&source)[ii] = max(((uint *)&source)[ii-1],((uint *)&source)[ii]);
+#endif
+
+#else
+  //
+  // SIMT
+  //
+
+  //
+  // zero the volatile smem scratchpad using vector syntax
+  //
+  smem->subgroup.vN.scratch[skc_subgroup_lane()] = ( 0 );
+
+  //
+  // store source lane at starting lane
+  //
+  if (is_scratch_store)
+    smem->subgroup.aN.scratch[scratch_idx] = skc_subgroup_lane();
+
+  //
+  // propagate lanes to right using max scan
+  //
+  SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[skc_subgroup_lane()];
+  SKC_RASTERIZE_UINT const source  = skc_subgroup_scan_inclusive_max(scratch);
+#endif
+
+  return source;
+}
+
+//
+// sliver lines into subpixels
+//
+
+static
+void
+skc_sliver(__global SKC_ATOMIC_UINT          volatile * const bp_atomics,
+           __global union skc_bp_elem                 * const bp_elems,
+           __global uint                              * const bp_ids,
+           skc_uint                                     const bp_mask,
+           __global SKC_ATOMIC_UINT          volatile * const cohort_atomics,
+           skc_block_id_t                             * const subblocks,
+           skc_block_id_v_t                           * const blocks,
+           skc_uint                                   * const blocks_next,
+           skc_ttsk_v_t                               * const sk_v,
+           skc_uint                                   * const sk_v_next,
+           __global skc_ttsk_s_t                      * const sk_extent,
+           __local  struct skc_subgroup_smem volatile * const smem,
+           SKC_RASTERIZE_FLOAT                          const l0x,
+           SKC_RASTERIZE_FLOAT                          const l0y,
+           SKC_RASTERIZE_FLOAT                          const l1x,
+           SKC_RASTERIZE_FLOAT                          const l1y)
+{
+  //
+  // Y-SLIVERING
+  // -----------
+  //
+  // immediately sliver all multi-pixel lines in into 1-pixel high
+  // lines
+  //
+  // note this implicitly squelches horizontal lines
+  //
+  // there is another test for horizontal lines after x-slivering
+  // is complete
+  //
+
+  //
+  // will we need to flip the sign of y_delta ?
+  //
+  SKC_RASTERIZE_PREDICATE const y_lt   = (l0y <= l1y);
+  SKC_RASTERIZE_UINT      const dy_xor = y_lt ? 0 : 0x80000000;
+
+  //
+  // save 1/dy
+  //
+  SKC_RASTERIZE_FLOAT const y_denom = native_recip(l1y - l0y);
+
+  //
+  // how many non-horizontal subpixel y-axis slivers are there?
+  //
+  SKC_RASTERIZE_FLOAT const y_min   = floor(fmin(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
+  SKC_RASTERIZE_FLOAT const y_max   = ceil (fmax(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
+  SKC_RASTERIZE_FLOAT const y_base  = y_lt ? y_min : y_max;
+  SKC_RASTERIZE_FLOAT       y_segs  = y_max - y_min;
+
+  //
+  // inclusive subgroup scan of y_segs
+  //
+  SKC_RASTERIZE_FLOAT       y_iss   = skc_subgroup_scan_inclusive_add_float(y_segs);
+  SKC_RASTERIZE_FLOAT       y_ess   = y_iss - y_segs;
+  float                     y_rem   = skc_subgroup_last_float(y_iss);
+
+  //
+  // if this is a horizontal line then tweak y_iss so "is_scratch_store" always fails
+  //
+  if (y_segs == 0.0f)
+    y_iss = 0.0f;
+
+#if 0
+  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } (* %5.0f / %5.0f / %5.0f / %5.0f *) }, \n",a0x,a0y,a1x,a1y,y_segs,y_iss,y_ess,y_rem);
+#endif
+
+  //
+  // these values don't matter on first iteration
+  //
+  SKC_RASTERIZE_FLOAT n1x_prev = 0;
+  SKC_RASTERIZE_FLOAT n1y_prev = 0;
+
+  //
+  // loop until done
+  //
+  while (y_rem > 0.0f)
+    {
+      //
+      // distribute work across lanes
+      //
+      SKC_RASTERIZE_UINT const y_source = skc_scatter_scan_max(smem,y_iss,y_ess);
+
+      //
+      // get line at y_source line
+      //
+      SKC_RASTERIZE_FLOAT const m0x = skc_subgroup_shuffle(l0x,y_source);
+      SKC_RASTERIZE_FLOAT const m0y = skc_subgroup_shuffle(l0y,y_source);
+      SKC_RASTERIZE_FLOAT const m1x = skc_subgroup_shuffle(l1x,y_source);
+      SKC_RASTERIZE_FLOAT const m1y = skc_subgroup_shuffle(l1y,y_source);
+
+      //
+      // every lane will create a 1 pixel tall line "sliver"
+      //
+      // FIXME -- this gets expanded on SIMD
+      //
+      // if numerator == 1 then this is the first lane
+      // if numerator == s then this is the last  lane
+      //
+      SKC_RASTERIZE_FLOAT     const y_delta    = skc_delta_offset() - skc_subgroup_shuffle(y_ess,y_source);
+      SKC_RASTERIZE_FLOAT     const y_count    = skc_subgroup_shuffle(y_segs,y_source);
+
+      SKC_RASTERIZE_PREDICATE const is_y_first = (y_delta == 1.0f);
+      SKC_RASTERIZE_PREDICATE const is_y_last  = (y_delta >= y_count);
+
+      // toggle y_delta sign
+      SKC_RASTERIZE_FLOAT     const y_offset   = as_float((as_uint(y_delta) ^ intel_sub_group_shuffle(dy_xor,y_source)));
+
+      //
+      // calculate "right" line segment endpoint
+      //
+      SKC_RASTERIZE_FLOAT       n1y = (y_offset + skc_subgroup_shuffle(y_base,y_source)) * SKC_SUBPIXEL_Y_SCALE_UP;
+      SKC_RASTERIZE_FLOAT const n_t = (n1y - m0y) * skc_subgroup_shuffle(y_denom,y_source);
+      SKC_RASTERIZE_FLOAT       n1x = round(SKC_LERP(m0x,m1x,n_t));
+
+      //
+      // override c1 if this is last point
+      //
+      n1y = select(n1y,m1y,is_y_last);
+      n1x = select(n1x,m1x,is_y_last);
+
+      //
+      // shuffle up "left" line segment endpoint
+      //
+      // NOTE: Intel's shuffle_up is unique with its elegant
+      // "previous" argument so don't get used to it
+      //
+      SKC_RASTERIZE_FLOAT n0y = skc_subgroup_shuffle_up_1(n1y_prev,n1y);
+      SKC_RASTERIZE_FLOAT n0x = skc_subgroup_shuffle_up_1(n1x_prev,n1x);
+
+      //
+      // override shuffle up if this is the first line segment
+      //
+      n0y = select(n0y,m0y,is_y_first);
+      n0x = select(n0x,m0x,is_y_first);
+
+      //
+      // save previous right endpoint
+      //
+      n1x_prev = n1x;
+      n1y_prev = n1y;
+
+      //
+      // decrement by subgroup size
+      //
+      y_iss -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+      y_ess -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+      y_rem -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+
+#if 0
+      //
+      // debug
+      //
+      if (n0y != n1y) {
+        printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",n0x,n0y,n1x,n1y);
+      }
+#endif
+
+      //
+      // X-SLIVERING
+      // -----------
+      //
+      // now sliver 1-pixel high lines into at either vertical or
+      // 1-pixel wide lines
+      //
+      // save original direction and work with increasing x
+      //
+      SKC_RASTERIZE_PREDICATE const x_lt   = (n0x <= n1x);
+      SKC_RASTERIZE_UINT      const dx_xor = x_lt ? 0 : 0x80000000;
+
+      //
+      // save 1/dy
+      //
+      SKC_RASTERIZE_FLOAT const x_denom  = native_recip(n1x - n0x);
+
+      //
+      // how many non-horizontal subpixel y-axis slivers are there?
+      //
+      SKC_RASTERIZE_FLOAT const x_min    = floor(fmin(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
+      SKC_RASTERIZE_FLOAT const x_max    = ceil (fmax(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
+      SKC_RASTERIZE_FLOAT const x_base   = x_lt ? x_min : x_max;
+      SKC_RASTERIZE_FLOAT const x_segs   = fmax(x_max - x_min,1.0f);
+
+      //
+      // inclusive subgroup scan of y_segs
+      //
+      SKC_RASTERIZE_FLOAT       x_iss    = skc_subgroup_scan_inclusive_add_float(x_segs);
+      SKC_RASTERIZE_FLOAT       x_ess    = x_iss - x_segs;
+      float                     x_rem    = skc_subgroup_last_float(x_iss);
+
+      //
+      // if this is a horizontal line then tweak x_iss so "is_scratch_store" always fails
+      //
+      //if (x_segs == 0.0f)
+      // x_iss = 0.0f;
+
+      //
+      // these values don't matter on first iteration
+      //
+      SKC_RASTERIZE_FLOAT       p1x_prev = 0;
+      SKC_RASTERIZE_FLOAT       p1y_prev = 0;
+
+      //
+      // loop until done
+      //
+      while (x_rem > 0)
+        {
+          //
+          // distribute work across lanes
+          //
+          SKC_RASTERIZE_UINT const x_source = skc_scatter_scan_max(smem,x_iss,x_ess);
+
+          //
+          // get line at y_source line
+          //
+          SKC_RASTERIZE_FLOAT const o0x = skc_subgroup_shuffle(n0x,x_source);
+          SKC_RASTERIZE_FLOAT const o0y = skc_subgroup_shuffle(n0y,x_source);
+          SKC_RASTERIZE_FLOAT const o1x = skc_subgroup_shuffle(n1x,x_source);
+          SKC_RASTERIZE_FLOAT const o1y = skc_subgroup_shuffle(n1y,x_source);
+
+          //
+          // every lane will create a 1 pixel tall line "sliver"
+          //
+          // FIXME -- this gets expanded on SIMD
+          //
+          // if numerator == 1 then this is the first lane
+          // if numerator == s then this is the last  lane
+          //
+          SKC_RASTERIZE_FLOAT     const x_delta    = skc_delta_offset() - skc_subgroup_shuffle(x_ess,x_source);
+          SKC_RASTERIZE_FLOAT     const x_count    = skc_subgroup_shuffle(x_segs,x_source);
+
+          SKC_RASTERIZE_PREDICATE const is_x_first = (x_delta == 1.0f);
+          SKC_RASTERIZE_PREDICATE const is_x_last  = (x_delta >= x_count);
+
+          // toggle x_delta sign
+          SKC_RASTERIZE_FLOAT     const x_offset   = as_float((as_uint(x_delta) ^ intel_sub_group_shuffle(dx_xor,x_source)));
+
+          //
+          // calculate "right" line segment endpoint
+          //
+          SKC_RASTERIZE_FLOAT       p1x = (x_offset + skc_subgroup_shuffle(x_base,x_source)) * SKC_SUBPIXEL_X_SCALE_UP;
+          SKC_RASTERIZE_FLOAT const p_t = (p1x - o0x) * skc_subgroup_shuffle(x_denom,x_source);
+          SKC_RASTERIZE_FLOAT       p1y = round(SKC_LERP(o0y,o1y,p_t));
+
+          //
+          // override c1 if this is last point
+          //
+          p1x = select(p1x,o1x,is_x_last);
+          p1y = select(p1y,o1y,is_x_last);
+
+          //
+          // shuffle up "left" line segment endpoint
+          //
+          // NOTE: Intel's shuffle_up is unique with its elegant
+          // "previous" argument so don't get used to it
+          //
+          SKC_RASTERIZE_FLOAT p0x = skc_subgroup_shuffle_up_1(p1x_prev,p1x);
+          SKC_RASTERIZE_FLOAT p0y = skc_subgroup_shuffle_up_1(p1y_prev,p1y);
+
+          //
+          // override shuffle up if this is the first line segment
+          //
+          p0x = select(p0x,o0x,is_x_first);
+          p0y = select(p0y,o0y,is_x_first);
+
+          //
+          // save previous right endpoint
+          //
+          p1x_prev = p1x;
+          p1y_prev = p1y;
+
+          //
+          // decrement by subgroup size
+          //
+          x_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+          x_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+          x_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+
+          //
+          // only non-horizontal subpixel lines are valid
+          //
+          SKC_RASTERIZE_PREDICATE is_active = (p0y != p1y);
+
+          //
+          // if no lanes are active then continue
+          //
+          // FIXME -- THIS SIMPLE SUB_GROUP_ANY TEST SIGNIFICANTLY
+          // IMPACTS PERFORMANCE (+12% ?)
+          //
+          // IT SHOULDN'T !!!
+          //
+#if 0
+          if (!skc_subgroup_any(is_active))
+            continue;
+#endif
+
+          //
+          // Option 1: use SLM for explicitly managed coalesced stores
+          //
+          // 1. which tile does this line belong?
+          // 2. hash tile coordinates
+          // 3. lookup hash
+          // 4. if tile matches then SLM append keys
+          // 5. if tile doesn't match
+          //   a. flush
+          //   b. create new TTSK_RYX
+          //   c. obtain TTSB block from pool
+          //   d. goto 3.
+          //
+
+          //
+          // Option 2: rely on L1/L2/L3 to mitigate non-coalesced stores
+          //
+          // 1. which tile does this line belong?
+          // 2. hash tile coordinates
+          // 3. lookup hash
+          // 4. if tile matches then GMEM append keys
+          // 5. if tile doesn't match
+          //   a. flush (and invalidate empty elems)
+          //   b. create new TTSK_RYX
+          //   c. obtain TTSB block from pool
+          //   d. goto 3.
+          //
+
+          //
+          // The virtual rasterization surface is very large and
+          // signed: +/- ~64K-256K, depending on the architecture.
+          //
+          // Rasters must be clipped to the virtual surface and,
+          // optionally, clipped even further on a per raster
+          // basis.
+          //
+
+          //
+          // Clip to the per-raster clip
+          //
+
+          /*
+
+            CLIP HERE
+
+          */
+
+          //
+          // Hash the tile coordinates
+          //
+          // This table lists nominal values for each architecture.
+          // We want to choose values that are naturally fit the
+          // "width" of the architecture.
+          //
+          //   SIMD   RANGE   BITS  MAX RANGE  MAX BINS  HASH BITS
+          //   ----  -------  ----  ---------  --------  ---------
+          //     4   [0,  4]    3    [0,  7]      10      mod(10)  <-- SSE42, ?
+          //     8   [0,  8]    4    [0, 15]       8         3     <-- GEN*,AVX*
+          //    16   [0, 16]    5    [0, 31]       6      mod(6)   <-- GEN*,?
+          //    32   [0, 32]    6    [0, 63]       5      mod(5)   <-- CUDA,PowerVR,Adreno,GEN*
+          //    64   [0, 64]    7    [0,127]       4         2     <-- AMD Radeon
+          //
+          // NOTE: When possible, bias the hash toward using more y
+          // bits because of:
+          //
+          //   1. the 90 degree counter-clockwise rotation that we put
+          //      in place to offset the render-time clockwise
+          //      rotation
+          //
+          //   2. the likely presence of left-to-right or
+          //      right-to-left glyphs.
+          //
+          // For power-of-two bins, the hash is easy.
+          //
+          // For non-power-of-two, we may want to either implement a
+          // fast mod (compiler should do this for us... hahahaha) or
+          // drop down to the next power-of-two.
+          //
+
+          //
+          // FIXME -- this snarl is not good -- can probably reduce
+          // some of the sign casting but some is there to vectorize a
+          // scalar
+          //
+          SKC_RASTERIZE_INT       const z0y    = SKC_CONVERT(SKC_RASTERIZE_INT)(p0y);
+          SKC_RASTERIZE_INT       const z1y    = SKC_CONVERT(SKC_RASTERIZE_INT)(p1y);
+
+          SKC_RASTERIZE_INT       const z0x    = SKC_CONVERT(SKC_RASTERIZE_INT)(p0x);
+          SKC_RASTERIZE_INT       const z1x    = SKC_CONVERT(SKC_RASTERIZE_INT)(p1x);
+
+          SKC_RASTERIZE_INT       const min_y  = min(z0y,z1y);
+          SKC_RASTERIZE_INT       const max_y  = max(z0y,z1y);
+
+          SKC_RASTERIZE_INT       const tile_y = min_y >> SKC_SUBTILE_RESL_Y_LOG2;
+
+          SKC_RASTERIZE_UINT      const ty     = SKC_AS(SKC_RASTERIZE_UINT)(min_y) & SKC_SUBTILE_MASK_Y;
+          SKC_RASTERIZE_INT             dy     = SKC_AS(SKC_RASTERIZE_INT)(z1y - z0y);
+
+          //
+          // map [+1,+32] to [ 0,+31]
+          // map [-1,-32] to [-1,-32]
+          //
+          SKC_RASTERIZE_INT             dys    = (dy + (~dy >> 31)) << 26;
+
+          SKC_RASTERIZE_INT       const min_x  = min(z0x,z1x);
+          SKC_RASTERIZE_INT       const max_x  = max(z0x,z1x);
+          SKC_RASTERIZE_INT       const tile_x = min_x >> SKC_SUBTILE_RESL_X_LOG2;
+
+          SKC_RASTERIZE_UINT      const tx     = SKC_AS(SKC_RASTERIZE_UINT)(min_x) & SKC_SUBTILE_MASK_X;
+          SKC_RASTERIZE_UINT      const sx     = SKC_AS(SKC_RASTERIZE_UINT)(max_x - min_x);
+
+          SKC_RASTERIZE_UINT      const tts    = dys | (ty << 16) | (sx << 10) | tx;
+
+          SKC_RASTERIZE_UINT      const hash   = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & SKC_RASTERIZE_TILE_HASH_Y_MASK) << SKC_RASTERIZE_TILE_HASH_X_BITS) |
+                                                   (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & SKC_RASTERIZE_TILE_HASH_X_MASK));
+
+          SKC_RASTERIZE_UINT      const yx     = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & 0xFFF) << 12) | (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & 0xFFF));
+
+#if 0
+          printf("(%3u, %3u)\n",tile_y,tile_x);
+#endif
+
+#if 0
+          if (is_active)
+            printf("( %3u, %3u ) : [ %3u, %3u, %3d, %3d, %3u ]\n",tile_y,tile_x,ty,tx,dy,((int)dys)>>26,sx);
+#endif
+
+          //
+          // debug
+          //
+#if 0 // PRINTF_ENABLE
+
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A)                                         \
+          if (is_active C)                                              \
+            printf("{ { %5d, %5d }, { %5d, %5d } (* %2u *) },\n",z0x C,z0y C,z1x C,z1y C,hash C);
+
+          SKC_RASTERIZE_VECTOR_EXPAND();
+#else
+          if (is_active)
+            printf("{ { %5d, %5d }, { %5d, %5d } } (* %2u *),\n",z0x,z0y,z1x,z1y,hash);
+#endif
+
+#endif
+          //
+          // flush all active lanes
+          //
+          while (true)
+            {
+              //
+              // either gather load or vector load+shuffle the yx keys
+              //
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+              SKC_RASTERIZE_BIN       const yx_bin     = smem->bin.vN.yx;
+              SKC_RASTERIZE_UINT      const yx_cur     = shuffle(yx_bin,hash);
+#else
+              SKC_RASTERIZE_UINT      const yx_cur     = smem->bin.aN.yx[hash];
+#endif
+
+              //
+              // does yx for lane match yx for hash?
+              //
+              SKC_RASTERIZE_UINT      const active_yx  = is_active ? yx : SKC_RASTERIZE_YX_INVALID;
+              SKC_RASTERIZE_PREDICATE const is_match   = (yx_cur == active_yx);
+
+              //
+              // OpenCL spec: "When casting a bool to a vector integer
+              // data type, the vector components will be set to -1
+              // (i.e. all bits set) if the vector bool value is true
+              // and 0 otherwise.
+              //
+#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+              SKC_RASTERIZE_UINT      const h_match    = (SKC_RASTERIZE_UINT)is_match;
+#else
+              SKC_RASTERIZE_UINT      const h_match    = abs(is_match); // {-1,0} -> {+1,0}
+#endif
+              //
+              // how many new elements for each matching hash bin?
+              //
+              SKC_RASTERIZE_UINT      const h_shl      = hash * SKC_RASTERIZE_TILE_HASH_BIN_BITS;
+              SKC_RASTERIZE_UINT      const h          = h_match << h_shl;
+
+              //
+              // prefix sum all of the bins in parallel
+              //
+              SKC_RASTERIZE_UINT      const h_iss      = skc_subgroup_scan_inclusive_add_uint(h);
+              SKC_RASTERIZE_UINT      const h_total    = skc_subgroup_last_uint(h_iss);
+
+              //
+              // current bin counts
+              //
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+              SKC_RASTERIZE_BIN       const count_bin  = smem->bin.vN.count;
+              SKC_RASTERIZE_UINT      const count_cur  = shuffle(count_bin,hash);
+#else
+              SKC_RASTERIZE_UINT      const count_cur  = smem->bin.aN.count[hash];
+#endif
+
+              //
+              // calculate where each cache-hit and in-bounds tts should be stored
+              //
+              SKC_RASTERIZE_UINT      const ttsb_index = (h_iss   >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur - 1;
+              SKC_RASTERIZE_UINT      const count_new  = (h_total >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur;
+
+              //
+              // which lanes can append to a matching bin?
+              //
+              SKC_RASTERIZE_PREDICATE const is_append  = is_match && (ttsb_index < SKC_DEVICE_SUBBLOCK_WORDS);
+
+              //
+              // scatter append tts elements to bin blocks
+              //
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1)
+              //
+              // SIMD
+              //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A)                                         \
+              if (is_append C)                                          \
+                {                                                       \
+                  smem->bin.aN.ttsb [hash C][ttsb_index C] = tts       C; \
+                  smem->bin.aN.count[hash C]               = count_new C; \
+                }
+
+              SKC_RASTERIZE_VECTOR_EXPAND();
+#else
+              //
+              // SIMT
+              //
+              if (is_append)
+                {
+                  smem->bin.aN.ttsb [hash][ttsb_index] = tts;
+                  smem->bin.aN.count[hash]             = count_new; // it's ok if this is > SKC_DEVICE_SUBBLOCK_WORDS
+                }
+#endif
+              //
+              // try to keep predicate updates SIMD-friendly and
+              // outside of predicated code paths -- this is not
+              // always how we would normally do things on SIMT but
+              // either approach is acceptable
+              //
+
+              //
+              // mask off lanes/components that successfully appended
+              //
+              is_active = is_active && !is_append;
+
+              //
+              // are there any active lanes left?
+              //
+              if (!skc_subgroup_any(is_active))
+                break;
+
+              //
+              // There are active lanes that couldn't be appended to a
+              // bin because their hashes collided with the bin's
+              // current ryx key then those bins must be ejected.
+              //
+              // Note that we do not eject "full" bins because lazily
+              // waiting for a collision results in simpler code.
+              //
+              skc_flush(bp_atomics,
+                        bp_elems,
+                        bp_ids,
+                        bp_mask,
+                        cohort_atomics,
+                        subblocks,
+                        blocks,
+                        blocks_next,
+                        sk_v,
+                        sk_v_next,
+                        sk_extent,
+                        smem,
+                        hash,
+                        yx,
+                        is_active);
+            }
+        }
+    }
+}
+
+//
+// INITIALIZE SMEM
+//
+// Note that SIMD/SIMT have nearly the same syntax.
+//
+static
+void
+skc_smem_init(__local struct skc_subgroup_smem volatile * const smem)
+{
+  //
+  // initialize smem bins
+  //
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+  smem->bin.vN.yx    = ( SKC_RASTERIZE_YX_INIT );
+  smem->bin.vN.count = ( 0 );
+#else
+  //
+  // SIMT
+  //
+  int idx = skc_subgroup_lane();
+
+#if   ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT < SKC_RASTERIZE_ELEMS_PER_SUBGROUP )
+  if (idx < SKC_RASTERIZE_TILE_HASH_BIN_COUNT)
+#elif ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT > SKC_RASTERIZE_ELEMS_PER_SUBGROUP )
+  for (; idx<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; idx+=SKC_RASTERIZE_SUBGROUP_SIZE)
+#endif
+    {
+      smem->bin.aN.yx   [idx] = ( SKC_RASTERIZE_YX_INIT );
+      smem->bin.aN.count[idx] = ( 0 );
+    }
+#endif
+}
+
+//
+// RASTERIZE CUBIC KERNEL
+//
+
+static
+void
+skc_rasterize_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
+                     __global union skc_bp_elem                * const bp_elems,
+                     __global uint                             * const bp_ids,
+                     skc_uint                                    const bp_mask,
+
+                     __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
+                     __global skc_ttsk_s_t                     * const sk_extent,
+
+                     __local struct skc_subgroup_smem volatile * const smem,
+
+                     skc_uint                                  * const nodeword,
+                     skc_block_id_t                            * const id,
+
+                     union skc_transform              const    * const tv,
+                     union skc_path_clip              const    * const cv,
+                     skc_uint                                    const cohort)
+{
+  //
+  // the initial segment idx and segments-per-block constant determine
+  // how many block ids will need to be loaded
+  //
+  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c3x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c3y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  //
+  // apply transform
+  //
+  // note that we only care if the end points are rounded to subpixel precision
+  //
+  // FIXME -- transformation is currently affine-only support perspective later
+  //
+  // the affine transformation requires 8 FMA + 2 ROUND operations
+  //
+  SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx  + c0y * tv->shx + tv->tx);
+  SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy  + tv->ty);
+
+  SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;
+  SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;
+
+  SKC_RASTERIZE_FLOAT const t2x = c2x * tv->sx  + c2y * tv->shx + tv->tx;
+  SKC_RASTERIZE_FLOAT const t2y = c2x * tv->shy + c2y * tv->sy  + tv->ty;
+
+  SKC_RASTERIZE_FLOAT const t3x = round(c3x * tv->sx  + c3y * tv->shx + tv->tx);
+  SKC_RASTERIZE_FLOAT const t3y = round(c3x * tv->shy + c3y * tv->sy  + tv->ty);
+
+  //
+  //
+  //
+#if PRINTF_ENABLE
+
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A)                                         \
+  printf("{ { %.02f, %.02f }, { %.02f, %.02f },"                        \
+         "  { %.02f, %.02f }, { %.02f, %.02f } },\n",                   \
+         b0x C,b0y C,t1x C,t1y C,                                       \
+         t2x C,t2y C,t3x C,t3y C);
+
+  SKC_RASTERIZE_VECTOR_EXPAND();
+
+#else
+
+  printf("{ { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f } },\n",
+         b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);
+
+#endif
+
+#endif
+
+  //
+  // OLD APPROACH
+  // ------------
+  //
+  // The Spinel CUDA rasterizer was significantly more complex and
+  // performed a few different tasks that are probably best kept
+  // separate.
+  //
+  // The Spinel rasterizer Bezier held 4-element x and y coordinates
+  // in adjacent lanes. This simplified intermingling of single lane
+  // 4-coordinate line segments with two-lane cubic Beziers.
+  //
+  // After transformation of the input segments, the Spinel rasterizer
+  // would test cubics for flatness and, if flat, collapse the
+  // adjacent lanes into a single line lane and an empty lane.
+  //
+  // Any lines would then be appended to a line queue.
+  //
+  // Any cubics would then be subdivided.
+  //
+  // The reclassification process would be repeated.
+  //
+  // NEW APPROACH
+  // ------------
+  //
+  // Assume we're only working with cubics in this kernel.
+  //
+  // Optimization: if the line segment is a special case -- a cusp,
+  // has 1+ inflections, or a loop -- it might be beneficial to
+  // subdivide the control cage 1+ times in order to separate the
+  // flatter segments the high-velocity region(s).
+  //
+  // This means we want to split using [a,b] formulation to _directly_
+  // subdivide producing a new control cage.
+  //
+  // Wang's Formula is still useful even if we subdivide once or twice
+  // as it's so cheap that it might give some useful hints about where
+  // the high-velocity sections of curve reside.
+  //
+  // But it seems like using Wang's and directly flattening to line
+  // segments without any subdivision is good enough for the limited
+  // set of test cases that I've tried.
+  //
+  // So... use Wang's Formula to estimate how many line segment are
+  // required to properly flatten the cubics.
+  //
+  // Then use inclusive/exclusive scans to put all the lanes to work:
+  //
+  //   1. segmenting cubics to line segments
+  //
+  //   2. slivering line segments into 1-pixel high line segments
+  //
+  //   3. slivering 1-pixel high line segments into 1-pixel wide line
+  //      segments
+  //
+  // MORE BACKGROUND ON NEW APPROACH
+  // -------------------------------
+  //
+  // Two options for handling line segments:
+  //
+  // 1. append the line segments onto an SLM array until enough
+  //    work has been accrued (Spinel does this)
+  //
+  // 2. immediately sliver the potentially multi-pixel line
+  //    segments into subpixel lines
+  //
+  // The advantage of (1) is that it guarantees the slivering
+  // process will, on average, always be emitting a full subgroup
+  // of subpixel lines.
+  //
+  // The advantage of (2) is that it reduces code complexity and
+  // leaves more room for SLM tile bins. The difference between Spinel
+  // and Skia Compute is that Wang's Formula guarantees there will be
+  // a full subgroup of multi-pixel lines unless this is the final
+  // iteration of the warp of multi-pixel lines.
+  //
+  // Note that wider GPU architectures might benefit from (1) and
+  // other work accumulation strategies because it will minimize
+  // partial warp workloads in the final iteration of each stage.  It
+  // also minimizes the sunk cost of the uniform control logic steps.
+  //
+  // So let's implement (2) for now...
+  //
+
+  //
+  // And... begin!
+  //
+  // Estimate how many line segments are in quad/cubic curve.
+  //
+  // Wang's Formula will return zero if the control points are
+  // collinear but we bump it up to 1.0f.
+  //
+  SKC_RASTERIZE_FLOAT const s_segs  = skc_wangs_formula_cubic(b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);
+
+  //
+  // if there are free registers then precalculate the reciprocal for
+  // each estimated segments since it will never change
+  //
+  SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);
+
+
+  //
+  // inclusive add scan of estimated line segments
+  // exclusive add scan of estimated line segments
+  // total number       of estimated line segments
+  //
+  SKC_RASTERIZE_FLOAT       s_iss   = skc_subgroup_scan_inclusive_add_float(s_segs);
+  SKC_RASTERIZE_FLOAT       s_ess   = s_iss - s_segs;
+  float                     s_rem   = skc_subgroup_last_float(s_iss); // scalar
+
+  //
+  // Precompute cubic polynomial coefficients from transformed control
+  // cage so we can shuffle them in on each iteration of the outer
+  // loop and then evaluate the polynomial in Horner form.
+  //
+  //                            |  1  0  0  0 | | c0 |
+  //                            |             | |    |
+  //                            | -3  3  0  0 | | c1 |
+  //   B(t) = [ 1 t^1 t^2 t^3 ] |             | |    |
+  //                            |  3 -6  3  0 | | c2 |
+  //                            |             | |    |
+  //                            | -1  3 -3  1 | | c3 |
+  //
+  //
+  SKC_RASTERIZE_FLOAT const b1x = mad(-3.0f,b0x,3.0f*t1x);                // 2 - 1 MAD + MUL
+  SKC_RASTERIZE_FLOAT const b1y = mad(-3.0f,b0y,3.0f*t1y);                // 2 - 1 MAD + MUL
+
+  SKC_RASTERIZE_FLOAT const b2x = mad(3.0f,b0x,mad(-6.0f,t1x,3.0f*t2x));  // 3 - 2 MAD + MUL
+  SKC_RASTERIZE_FLOAT const b2y = mad(3.0f,b0y,mad(-6.0f,t1y,3.0f*t2y));  // 3 - 2 MAD + MUL
+
+  SKC_RASTERIZE_FLOAT const b3x = mad(3.0f,t1x,mad(-3.0f,t2x,t3x)) - b0x; // 3 - 2 MAD + SUB
+  SKC_RASTERIZE_FLOAT const b3y = mad(3.0f,t1y,mad(-3.0f,t2y,t3y)) - b0y; // 3 - 2 MAD + SUB
+
+  //
+  // these values don't matter on the first iteration
+  //
+  SKC_RASTERIZE_FLOAT l1x_prev  = 0;
+  SKC_RASTERIZE_FLOAT l1y_prev  = 0;
+
+  //
+  // allocate and init in-register TTSK keys
+  //
+  skc_uint     sk_v_next = 0;
+  skc_ttsk_v_t sk_v; 
+
+  sk_v.hi = cohort;
+
+  //
+  // initialize smem
+  //
+  skc_smem_init(smem);
+
+  //
+  // initialize blocks / subblocks
+  //
+  skc_block_id_v_t blocks;
+  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
+
+#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
+  skc_block_id_t   subblocks   = 0;
+#endif
+
+  //
+  // loop until done
+  //
+  while (s_rem > 0)
+    {
+      //
+      // distribute work across lanes
+      //
+      SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);
+
+      //
+      // every lane has a fraction to work off of
+      //
+      // FIXME -- this gets expanded on SIMD
+      //
+      // if delta == 1      then this is the first lane
+      // if count == s_segs then this is the last  lane
+      //
+      SKC_RASTERIZE_FLOAT     const s_delta    = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
+      SKC_RASTERIZE_FLOAT     const s_count    = skc_subgroup_shuffle(s_segs,s_source);
+
+      SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
+      SKC_RASTERIZE_PREDICATE const is_s_last  = (s_delta >= s_count);
+
+      //
+      // init parametric t
+      //
+      SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?
+
+      //
+      // if last then override to a hard 1.0f
+      //
+      s_t    = is_s_last ? 1.0f : s_t;
+
+      //
+      // decrement by subgroup size
+      //
+      s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+      s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+      s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+
+      //
+      // now every lane knows what to do and the following lines will
+      // pump out up to SUBGROUP_SIZE line segments
+      //
+      // obtain the src vertices through shared or via a shuffle
+      //
+
+      //
+      // shuffle in the polynomial coefficients their source lane
+      //
+      SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
+      SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);
+
+      SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
+      SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);
+
+      SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
+      SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);
+
+      SKC_RASTERIZE_FLOAT const s3x = skc_subgroup_shuffle(b3x,s_source);
+      SKC_RASTERIZE_FLOAT const s3y = skc_subgroup_shuffle(b3y,s_source);
+
+      //
+      // calculate "right" line segment endpoint using Horner form
+      //
+      SKC_RASTERIZE_FLOAT       l1x = round(mad(mad(mad(s3x,s_t,s2x),s_t,s1x),s_t,s0x)); // 3 MAD + ROUND
+      SKC_RASTERIZE_FLOAT       l1y = round(mad(mad(mad(s3y,s_t,s2y),s_t,s1y),s_t,s0y)); // 3 MAD + ROUND
+
+      //
+      // shuffle up "left" line segment endpoint
+      //
+      // NOTE: Intel's shuffle_up is unique with its elegant
+      // "previous" argument so don't get used to it
+      //
+      SKC_RASTERIZE_FLOAT       l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
+      SKC_RASTERIZE_FLOAT       l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);
+
+      //
+      // save previous right endpoint
+      //
+      l1x_prev = l1x;
+      l1y_prev = l1y;
+
+      //
+      // override shuffle up if this is the first line segment
+      //
+      l0x = select(l0x,s0x,is_s_first);
+      l0y = select(l0y,s0y,is_s_first);
+
+      //
+      // sliver lines
+      //
+      skc_sliver(bp_atomics,
+                 bp_elems,
+                 bp_ids,
+                 bp_mask,
+                 cohort_atomics,
+                 &subblocks,
+                 &blocks,
+                 &blocks_next,
+                 &sk_v,
+                 &sk_v_next,
+                 sk_extent,
+                 smem,
+                 l0x,l0y,l1x,l1y);
+    }
+
+  //
+  // - flush work-in-progress blocks
+  // - return unused block ids
+  //
+  skc_finalize(bp_atomics,
+               bp_elems,
+               bp_ids,
+               bp_mask,
+               cohort_atomics,
+               &blocks,
+               blocks_next,
+               &sk_v,
+               sk_v_next,
+               sk_extent,
+               smem);
+}
+
+//
+// RASTERIZE QUAD KERNEL
+//
+
+static
+void
+skc_rasterize_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
+                    __global union skc_bp_elem                * const bp_elems,
+                    __global uint                             * const bp_ids,
+                    skc_uint                                    const bp_mask,
+
+                    __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
+                    __global skc_ttsk_s_t                     * const sk_extent,
+
+                    __local struct skc_subgroup_smem volatile * const smem,
+                    
+                    skc_uint                                  * const nodeword,
+                    skc_block_id_t                            * const id,
+
+                    union skc_transform              const    * const tv,
+                    union skc_path_clip              const    * const cv,
+                    skc_uint                                    const cohort)
+{
+  //
+  // the initial segment idx and segments-per-block constant determine
+  // how many block ids will need to be loaded
+  //
+  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  //
+  // apply transform
+  //
+  // note that we only care if the end points are rounded to subpixel precision
+  //
+  // FIXME -- transformation is currently affine-only support perspective later
+  //
+  // the affine transformation requires 8 FMA + 2 ROUND operations
+  //
+  SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx  + c0y * tv->shx + tv->tx);
+  SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy  + tv->ty);
+
+  SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;
+  SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;
+
+  SKC_RASTERIZE_FLOAT const t2x = round(c2x * tv->sx  + c2y * tv->shx + tv->tx);
+  SKC_RASTERIZE_FLOAT const t2y = round(c2x * tv->shy + c2y * tv->sy  + tv->ty);
+
+  //
+  // Estimate how many line segments are in quad/cubic curve.
+  //
+  // Wang's Formula will return zero if the control points are
+  // collinear but we bump it up to 1.0f.
+  //
+  SKC_RASTERIZE_FLOAT const s_segs  = skc_wangs_formula_quadratic(b0x,b0y,t1x,t1y,t2x,t2y);
+
+  //
+  // if there are free registers then precalculate the reciprocal for
+  // each estimated segments since it will never change
+  //
+  SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);
+
+
+  //
+  // inclusive add scan of estimated line segments
+  // exclusive add scan of estimated line segments
+  // total number       of estimated line segments
+  //
+  SKC_RASTERIZE_FLOAT       s_iss   = skc_subgroup_scan_inclusive_add_float(s_segs);
+  SKC_RASTERIZE_FLOAT       s_ess   = s_iss - s_segs;
+  float                     s_rem   = skc_subgroup_last_float(s_iss); // scalar
+
+  //
+  // Precompute quadratic polynomial coefficients from control cage so
+  // we can shuffle them in on each iteration of the outer loop and
+  // then evaluate the polynomial in Horner form.
+  //
+
+  //                        |  1  0  0  | | c0 |
+  //                        |           | |    |
+  //   B(t) = [ 1 t^1 t^2 ] | -2  2  0  | | c1 |
+  //                        |           | |    |
+  //                        |  1 -2  1  | | c2 |
+  //
+  //
+  SKC_RASTERIZE_FLOAT const b1x = mad(-2.0f,b0x,2.0f*t1x); // 2 - 1 MAD + MUL
+  SKC_RASTERIZE_FLOAT const b1y = mad(-2.0f,b0y,2.0f*t1y); // 2 - 1 MAD + MUL
+
+  SKC_RASTERIZE_FLOAT const b2x = mad(-2.0f,t1x,b0x+t2x);  // 2 - 1 MAD + ADD
+  SKC_RASTERIZE_FLOAT const b2y = mad(-2.0f,t1y,b0y+t2y);  // 2 - 1 MAD + ADD
+
+  //
+  // these values don't matter on the first iteration
+  //
+  SKC_RASTERIZE_FLOAT l1x_prev  = 0;
+  SKC_RASTERIZE_FLOAT l1y_prev  = 0;
+
+  //
+  // allocate and init in-register TTSK keys
+  //
+  skc_uint     sk_v_next = 0;
+  skc_ttsk_v_t sk_v; 
+
+  sk_v.hi = cohort;
+
+  //
+  // initialize smem
+  //
+  skc_smem_init(smem);
+
+  //
+  // initialize blocks / subblocks
+  //
+  skc_block_id_v_t blocks;
+  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
+
+#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
+  skc_block_id_t   subblocks   = 0;
+#endif
+
+  //
+  // loop until done
+  //
+  while (s_rem > 0)
+    {
+      //
+      // distribute work across lanes
+      //
+      SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);
+
+      //
+      // every lane has a fraction to work off of
+      //
+      // FIXME -- this gets expanded on SIMD
+      //
+      // if delta == 1      then this is the first lane
+      // if count == s_segs then this is the last  lane
+      //
+      SKC_RASTERIZE_FLOAT     const s_delta    = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
+      SKC_RASTERIZE_FLOAT     const s_count    = skc_subgroup_shuffle(s_segs,s_source);
+
+      SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
+      SKC_RASTERIZE_PREDICATE const is_s_last  = (s_delta >= s_count);
+
+      //
+      // init parametric t
+      //
+      SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?
+
+      //
+      // if last then override to a hard 1.0f
+      //
+      s_t    = is_s_last ? 1.0f : s_t;
+
+      //
+      // decrement by subgroup size
+      //
+      s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+      s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+      s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+
+      //
+      // now every lane knows what to do and the following lines will
+      // pump out up to SUBGROUP_SIZE line segments
+      //
+      // obtain the src vertices through shared or via a shuffle
+      //
+
+      //
+      // shuffle in the polynomial coefficients their source lane
+      //
+      SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
+      SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);
+
+      SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
+      SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);
+
+      SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
+      SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);
+
+      //
+      // calculate "right" line segment endpoint using Horner form
+      //
+      SKC_RASTERIZE_FLOAT       l1x = round(mad(mad(s2x,s_t,s1x),s_t,s0x)); // 2 MAD + ROUND
+      SKC_RASTERIZE_FLOAT       l1y = round(mad(mad(s2y,s_t,s1y),s_t,s0y)); // 2 MAD + ROUND
+
+      //
+      // shuffle up "left" line segment endpoint
+      //
+      // NOTE: Intel's shuffle_up is unique with its elegant
+      // "previous" argument so don't get used to it
+      //
+      SKC_RASTERIZE_FLOAT       l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
+      SKC_RASTERIZE_FLOAT       l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);
+
+      //
+      // save previous right endpoint
+      //
+      l1x_prev = l1x;
+      l1y_prev = l1y;
+
+      //
+      // override shuffle up if this is the first line segment
+      //
+      l0x = select(l0x,s0x,is_s_first);
+      l0y = select(l0y,s0y,is_s_first);
+
+      //
+      // sliver lines
+      //
+      skc_sliver(bp_atomics,
+                 bp_elems,
+                 bp_ids,
+                 bp_mask,
+                 cohort_atomics,
+                 &subblocks,
+                 &blocks,
+                 &blocks_next,
+                 &sk_v,
+                 &sk_v_next,
+                 sk_extent,
+                 smem,
+                 l0x,l0y,l1x,l1y);
+    }
+
+  //
+  // - flush work-in-progress blocks
+  // - return unused block ids
+  //
+  skc_finalize(bp_atomics,
+               bp_elems,
+               bp_ids,
+               bp_mask,
+               cohort_atomics,
+               &blocks,
+               blocks_next,
+               &sk_v,
+               sk_v_next,
+               sk_extent,
+               smem);
+}
+
+//
+// RASTERIZE LINE KERNEL
+//
+
+static
+void
+skc_rasterize_lines(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
+                    __global union skc_bp_elem                * const bp_elems,
+                    __global uint                             * const bp_ids,
+                    skc_uint                                    const bp_mask,
+
+                    __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
+                    __global skc_ttsk_s_t                     * const sk_extent,
+
+                    __local struct skc_subgroup_smem volatile * const smem,
+                    
+                    skc_uint                                  * const nodeword,
+                    skc_block_id_t                            * const id,
+
+                    union skc_transform              const    * const tv,
+                    union skc_path_clip              const    * const cv,
+                    skc_uint                                    const cohort)
+{
+  //
+  // the initial segment idx and segments-per-block constant determine
+  // how many block ids will need to be loaded
+  //
+  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+#if 0
+  // printf("%5u : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",(skc_uint)get_global_id(0),c0x,c0y,c1x,c1y);
+  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",c0x,c0y,c1x,c1y);
+#endif
+
+  //
+  // apply transform
+  //
+  // note that we only care if the end points are rounded to subpixel precision
+  //
+  // FIXME -- transformation is currently affine-only
+  // FIXME -- support perspective later
+  //
+  // the affine transformation requires 8 FMA + 4 ROUND operations
+  //
+  SKC_RASTERIZE_FLOAT const l0x = round(c0x * tv->sx  + c0y * tv->shx + tv->tx);
+  SKC_RASTERIZE_FLOAT const l0y = round(c0x * tv->shy + c0y * tv->sy  + tv->ty);
+
+  SKC_RASTERIZE_FLOAT const l1x = round(c1x * tv->sx  + c1y * tv->shx + tv->tx);
+  SKC_RASTERIZE_FLOAT const l1y = round(c1x * tv->shy + c1y * tv->sy  + tv->ty);
+
+#if 0
+  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",l0x,l0y,l1x,l1y);
+#endif
+
+  //
+  // allocate and init in-register TTSK keys
+  //
+  skc_uint     sk_v_next = 0;
+  skc_ttsk_v_t sk_v; 
+
+  sk_v.hi = cohort;
+
+  //
+  // initialize smem
+  //
+  skc_smem_init(smem);
+
+  //
+  // initialize blocks / subblocks
+  //
+  skc_block_id_v_t blocks;
+  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
+
+#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
+  skc_block_id_t   subblocks   = 0;
+#endif
+
+  //
+  // sliver lines
+  //
+  skc_sliver(bp_atomics,
+             bp_elems,
+             bp_ids,
+             bp_mask,
+             cohort_atomics,
+             &subblocks,
+             &blocks,
+             &blocks_next,
+             &sk_v,
+             &sk_v_next,
+             sk_extent,
+             smem,
+             l0x,l0y,l1x,l1y);
+
+  //
+  // - flush work-in-progress blocks
+  // - return unused block ids
+  //
+  skc_finalize(bp_atomics,
+               bp_elems,
+               bp_ids,
+               bp_mask,
+               cohort_atomics,
+               &blocks,
+               blocks_next,
+               &sk_v,
+               sk_v_next,
+               sk_extent,
+               smem);
+}
+
+//
+//
+//
+
+__kernel
+SKC_RASTERIZE_KERNEL_ATTRIBS
+void
+skc_kernel_rasterize_all(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
+                         __global union skc_bp_elem                * const bp_elems,
+                         __global uint                             * const bp_ids,
+                         skc_uint                                    const bp_mask,
+
+                         __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
+                         __global skc_ttsk_s_t                     * const sk_extent,
+
+                         __global float8                  const    * const transforms, // FIXME -- __constant
+                         __global float4                  const    * const clips,      // FIXME -- __constant
+                         __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
+                         skc_uint                                    const count)
+{
+  //
+  // declare shared memory block
+  //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+  __local struct skc_subgroup_smem volatile                smem[1];
+#else
+  __local struct skc_subgroup_smem volatile                smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
+  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
+#endif
+  
+  //
+  // this is a subgroup/warp-centric kernel
+  //
+  // which subgroup in the grid is this?
+  //
+  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
+  // get_group_id(0) as a uniform but the alternative calculation used
+  // when there are multiple subgroups per workgroup is not
+  // cooperating and driving spillage elsewhere.
+  //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+  uint const cmd_idx = get_group_id(0);
+#else
+  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("+cmd_idx = %u\n",cmd_idx);
+#endif
+
+  //
+  // if worksgroups are multi-subgroup then there may be excess
+  // subgroups in the final workgroup
+  //
+  if (cmd_idx >= count)
+    return;
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("-cmd_idx = %u\n",cmd_idx);
+#endif
+
+  //
+  // load a single command for this subgroup
+  //
+  union skc_cmd_rasterize const cmd = cmds[cmd_idx];
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("[ %u ]< %u, %u, %u, %u >\n",
+           cmd_idx,
+           cmd.nodeword,
+           SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd),
+           SKC_CMD_RASTERIZE_GET_CLIP(cmd),
+           SKC_CMD_RASTERIZE_GET_COHORT(cmd));
+#endif
+
+  //
+  // get first block node command word and its subblock
+  //
+  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
+  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
+  skc_block_id_tag      tag      = SKC_TAGGED_BLOCK_ID_GET_TAG(tag_id);
+  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
+
+  //
+  // load transform -- uniform across subgroup
+  //
+  // v8: { sx shx tx shy sy ty w0 w1 }
+  //
+  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
+  //
+  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
+  //
+  // Coordinates are scaled to subpixel resolution.  All that matters
+  // is that continuity is maintained between end path element
+  // endpoints.
+  //
+  // It's the responsibility of the host to ensure that the transforms
+  // are properly scaled either via intitializing a transform stack
+  // with the subpixel resolution scaled identity or scaling the
+  // transform before its loaded by a rasterization grid.
+  //
+  // FIXME -- horizontal load might be better than this broadcast load
+  //
+  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
+  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
+  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
+
+  switch (tag)
+    {
+    case SKC_BLOCK_ID_TAG_PATH_LINE:
+      skc_rasterize_lines(bp_atomics,
+                          bp_elems,
+                          bp_ids,
+                          bp_mask,
+                          cohort_atomics,
+                          sk_extent,
+                          smem,
+                          &nodeword,&id,
+                          &tv,&cv,cohort);
+      break;
+
+    case SKC_BLOCK_ID_TAG_PATH_QUAD:
+      skc_rasterize_quads(bp_atomics,
+                          bp_elems,
+                          bp_ids,
+                          bp_mask,
+                          cohort_atomics,
+                          sk_extent,
+                          smem,
+                          &nodeword,&id,
+                          &tv,&cv,cohort);
+      break;
+
+    case SKC_BLOCK_ID_TAG_PATH_CUBIC:
+      skc_rasterize_cubics(bp_atomics,
+                           bp_elems,
+                           bp_ids,
+                           bp_mask,
+                           cohort_atomics,
+                           sk_extent,
+                           smem,
+                           &nodeword,&id,
+                           &tv,&cv,cohort);
+      break;
+
+    case SKC_BLOCK_ID_TAG_PATH_RAT_QUAD:
+      break;
+    case SKC_BLOCK_ID_TAG_PATH_RAT_CUBIC:
+      break;
+
+    default:
+      break;
+    }
+}
+
+//
+//
+//
+
+__kernel
+SKC_RASTERIZE_KERNEL_ATTRIBS
+void
+skc_kernel_rasterize_lines(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
+                           __global union skc_bp_elem                * const bp_elems,
+                           __global uint                             * const bp_ids,
+                           skc_uint                                    const bp_mask,
+
+                           __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
+                           __global skc_ttsk_s_t                     * const sk_extent,
+
+                           __global float8                  const    * const transforms, // FIXME -- __constant
+                           __global float4                  const    * const clips,      // FIXME -- __constant
+                           __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
+                           skc_uint                                    const count)
+{
+  //
+  // declare shared memory block
+  //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+  __local struct skc_subgroup_smem volatile                smem[1];
+#else
+  __local struct skc_subgroup_smem volatile                smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
+  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
+#endif
+  
+  //
+  // this is a subgroup/warp-centric kernel
+  //
+  // which subgroup in the grid is this?
+  //
+  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
+  // get_group_id(0) as a uniform but the alternative calculation used
+  // when there are multiple subgroups per workgroup is not
+  // cooperating and driving spillage elsewhere.
+  //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+  uint const cmd_idx = get_group_id(0);
+#else
+  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
+
+  //
+  // if worksgroups are multi-subgroup then there may be excess
+  // subgroups in the final workgroup
+  //
+  if (cmd_idx >= count)
+    return;
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("cmd_idx = %u\n",cmd_idx);
+#endif
+
+  //
+  // load a single command for this subgroup
+  //
+  union skc_cmd_rasterize const cmd = cmds[cmd_idx];
+
+  //
+  // get first block node command word and its subblock
+  //
+  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
+  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
+  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
+
+  //
+  // load transform -- uniform across subgroup
+  //
+  // v8: { sx shx tx shy sy ty w0 w1 }
+  //
+  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
+  //
+  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
+  //
+  // Coordinates are scaled to subpixel resolution.  All that matters
+  // is that continuity is maintained between end path element
+  // endpoints.
+  //
+  // It's the responsibility of the host to ensure that the transforms
+  // are properly scaled either via intitializing a transform stack
+  // with the subpixel resolution scaled identity or scaling the
+  // transform before its loaded by a rasterization grid.
+  //
+  // FIXME -- horizontal load might be better than this broadcast load
+  //
+  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
+  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
+  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
+
+  skc_rasterize_lines(bp_atomics,
+                      bp_elems,
+                      bp_ids,
+                      bp_mask,
+                      cohort_atomics,
+                      sk_extent,
+                      smem,
+                      &nodeword,&id,
+                      &tv,&cv,cohort);
+}
+
+//
+//
+//
+
+//
+//
+//
+
+__kernel
+SKC_RASTERIZE_KERNEL_ATTRIBS
+void
+skc_kernel_rasterize_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
+                           __global union skc_bp_elem                * const bp_elems,
+                           __global uint                             * const bp_ids,
+                           skc_uint                                    const bp_mask,
+
+                           __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
+                           __global skc_ttsk_s_t                     * const sk_extent,
+
+                           __global float8                  const    * const transforms, // FIXME -- __constant
+                           __global float4                  const    * const clips,      // FIXME -- __constant
+                           __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
+                           skc_uint                                    const count)
+{
+  //
+  // declare shared memory block
+  //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+  __local struct skc_subgroup_smem volatile                smem[1];
+#else
+  __local struct skc_subgroup_smem volatile                smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
+  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
+#endif
+  
+  //
+  // this is a subgroup/warp-centric kernel
+  //
+  // which subgroup in the grid is this?
+  //
+  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
+  // get_group_id(0) as a uniform but the alternative calculation used
+  // when there are multiple subgroups per workgroup is not
+  // cooperating and driving spillage elsewhere.
+  //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+  uint const cmd_idx = get_group_id(0);
+#else
+  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
+
+  //
+  // if worksgroups are multi-subgroup then there may be excess
+  // subgroups in the final workgroup
+  //
+  if (cmd_idx >= count)
+    return;
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("cmd_idx = %u\n",cmd_idx);
+#endif
+
+  //
+  // load a single command for this subgroup
+  //
+  union skc_cmd_rasterize const cmd = cmds[cmd_idx];
+
+  //
+  // get first block node command word and its subblock
+  //
+  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
+  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
+  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
+
+  //
+  // load transform -- uniform across subgroup
+  //
+  // v8: { sx shx tx shy sy ty w0 w1 }
+  //
+  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
+  //
+  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
+  //
+  // Coordinates are scaled to subpixel resolution.  All that matters
+  // is that continuity is maintained between end path element
+  // endpoints.
+  //
+  // It's the responsibility of the host to ensure that the transforms
+  // are properly scaled either via intitializing a transform stack
+  // with the subpixel resolution scaled identity or scaling the
+  // transform before its loaded by a rasterization grid.
+  //
+  // FIXME -- horizontal load might be better than this broadcast load
+  //
+  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
+  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
+  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
+
+  skc_rasterize_quads(bp_atomics,
+                      bp_elems,
+                      bp_ids,
+                      bp_mask,
+                      cohort_atomics,
+                      sk_extent,
+                      smem,
+                      &nodeword,&id,
+                      &tv,&cv,cohort);
+}
+
+//
+//
+//
+
+__kernel
+SKC_RASTERIZE_KERNEL_ATTRIBS
+void
+skc_kernel_rasterize_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
+                            __global union skc_bp_elem                * const bp_elems,
+                            __global uint                             * const bp_ids,
+                            skc_uint                                    const bp_mask,
+
+                            __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
+                            __global skc_ttsk_s_t                     * const sk_extent,
+
+                            __global float8                  const    * const transforms, // FIXME -- __constant
+                            __global float4                  const    * const clips,      // FIXME -- __constant
+                            __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
+                            skc_uint                                    const count)
+{
+  //
+  // declare shared memory block
+  //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+  __local struct skc_subgroup_smem volatile                smem[1];
+#else
+  __local struct skc_subgroup_smem volatile                smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
+  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
+#endif
+  
+  //
+  // this is a subgroup/warp-centric kernel
+  //
+  // which subgroup in the grid is this?
+  //
+  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
+  // get_group_id(0) as a uniform but the alternative calculation used
+  // when there are multiple subgroups per workgroup is not
+  // cooperating and driving spillage elsewhere.
+  //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+  uint const cmd_idx = get_group_id(0);
+#else
+  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
+
+  //
+  // if worksgroups are multi-subgroup then there may be excess
+  // subgroups in the final workgroup
+  //
+  if (cmd_idx >= count)
+    return;
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("cmd_idx = %u\n",cmd_idx);
+#endif
+
+  //
+  // load a single command for this subgroup
+  //
+  union skc_cmd_rasterize const cmd = cmds[cmd_idx];
+
+  //
+  // get first block node command word and its subblock
+  //
+  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
+  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
+  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
+
+  //
+  // load transform -- uniform across subgroup
+  //
+  // v8: { sx shx tx shy sy ty w0 w1 }
+  //
+  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
+  //
+  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
+  //
+  // Coordinates are scaled to subpixel resolution.  All that matters
+  // is that continuity is maintained between end path element
+  // endpoints.
+  //
+  // It's the responsibility of the host to ensure that the transforms
+  // are properly scaled either via intitializing a transform stack
+  // with the subpixel resolution scaled identity or scaling the
+  // transform before its loaded by a rasterization grid.
+  //
+  // FIXME -- horizontal load might be better than this broadcast load
+  //
+  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
+  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
+  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
+
+  skc_rasterize_cubics(bp_atomics,
+                       bp_elems,
+                       bp_ids,
+                       bp_mask,
+                       cohort_atomics,
+                       sk_extent,
+                       smem,
+                       &nodeword,&id,
+                       &tv,&cv,cohort);
+}
+
+//
+//
+//
+
+__kernel
+SKC_RASTERIZE_KERNEL_ATTRIBS
+void
+skc_kernel_rasterize_rat_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
+                               __global union skc_bp_elem                * const bp_elems,
+                               __global uint                             * const bp_ids,
+                               skc_uint                                    const bp_mask,
+
+                               __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
+                               __global skc_ttsk_s_t                     * const sk_extent,
+
+                               __global float8                  const    * const transforms, // FIXME -- __constant
+                               __global float4                  const    * const clips,      // FIXME -- __constant
+                               __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
+                               skc_uint                                    const count)
+{
+  ;
+}
+
+//
+//
+//
+
+__kernel
+SKC_RASTERIZE_KERNEL_ATTRIBS
+void
+skc_kernel_rasterize_rat_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
+                                __global union skc_bp_elem                * const bp_elems,
+                                __global uint                             * const bp_ids,
+                                skc_uint                                    const bp_mask,
+
+                                __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
+                                __global skc_ttsk_s_t                     * const sk_extent,
+
+                                __global float8                  const    * const transforms, // FIXME -- __constant
+                                __global float4                  const    * const clips,      // FIXME -- __constant
+                                __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
+                                skc_uint                                    const count)
+{
+  ;
+}
+
+//
+//
+//