1 files changed, 871 insertions, 871 deletions
diff --git a/src/compute/skc/platforms/cl_12/kernels/place.cl b/src/compute/skc/platforms/cl_12/kernels/place.cl
index 92fa0a243d..8866bdb3e6 100644
--- a/src/compute/skc/platforms/cl_12/kernels/place.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/place.cl
@@ -1,871 +1,871 @@
-/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "tile.h"
-#include "common.h"
-#include "raster.h"
-#include "atomic_cl.h"
-#include "device_cl_12.h"
-
-//
-//
-//
-
-#define SKC_PLACE_SUBGROUP_MASK      (SKC_PLACE_SUBGROUP_SIZE - 1)
-#define SKC_PLACE_SUBGROUP_LAST      (SKC_PLACE_SUBGROUP_SIZE - 1)
-
-//
-//
-//
-
-#define SKC_PLACE_SMEM_COUNT_TTSK    SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE)
-#define SKC_PLACE_SMEM_COUNT_TTPK    SKC_RASTER_NODE_MAX_TTPK
-
-//
-//
-//
-
-#define SKC_PLACE_X                  (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE)
-
-//
-//
-//
-
-#if   ( SKC_PLACE_X == 1 )
-#define SKC_PLACE_EXPAND()           SKC_EXPAND_1()
-#define SKC_PLACE_EXPAND_I_LAST      0
-
-#elif ( SKC_PLACE_X == 2 )
-#define SKC_PLACE_EXPAND()           SKC_EXPAND_2()
-#define SKC_PLACE_EXPAND_I_LAST      1
-
-#elif ( SKC_PLACE_X == 4 )
-#define SKC_PLACE_EXPAND()           SKC_EXPAND_4()
-#define SKC_PLACE_EXPAND_I_LAST      3
-
-#elif ( SKC_PLACE_X == 8 )
-#define SKC_PLACE_EXPAND()           SKC_EXPAND_8()
-#define SKC_PLACE_EXPAND_I_LAST      7
-
-#elif ( SKC_PLACE_X == 16)
-#define SKC_PLACE_EXPAND()           SKC_EXPAND_16()
-#define SKC_PLACE_EXPAND_I_LAST      15
-#endif
-
-//
-// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE
-// COALESCED WRITES.  LO FIRST, FOLLOWED BY HI.
-//
-// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE
-// KERNELS USE DIFFERENT SUBGROUP SIZES.
-//
-// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE
-// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID.
-//
-// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER
-// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). VULKAN MAY
-// ONLY SUPPORT A SUBGROUP SIZE OF 16.
-//
-
-#if    ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE )
-
-#define SKC_PLACE_STRIDE_H(L)              (L)
-#define SKC_PLACE_STRIDE_V_LO(I)           (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
-#define SKC_PLACE_STRIDE_V_HI(I)           (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE)
-
-#elif  ( SKC_PREFIX_SUBGROUP_SIZE >  SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1
-
-#define SKC_PLACE_SUBGROUP_RATIO           (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE)
-#define SKC_PLACE_SUBGROUP_RATIO_MASK      (SKC_PLACE_SUBGROUP_RATIO - 1)
-#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I)  ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK))
-
-#define SKC_PLACE_STRIDE_H(L)              (L)
-#define SKC_PLACE_STRIDE_V_LO(I)           (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE)
-#define SKC_PLACE_STRIDE_V_HI(I)           (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE)
-
-#elif  ( SKC_PREFIX_SUBGROUP_SIZE <  SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1
-
-#define SKC_PLACE_SUBGROUP_RATIO           (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
-#define SKC_PLACE_SUBGROUP_RATIO_MASK      (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
-
-#define SKC_PLACE_STRIDE_H(L)              (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK))
-#define SKC_PLACE_STRIDE_V_LO(I)           (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
-#define SKC_PLACE_STRIDE_V_HI(I)           (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO)
-
-#endif
-
-//
-// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE
-// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8)
-//
-
-#define SKC_PLACE_IS_ALL_HEADER_ROW(i)   (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS)
-
-#define SKC_PLACE_IS_NOT_HEADER_ROW(i)   ( (i)    * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS)
-
-#define SKC_PLACE_IS_TRAILING_ROW(i)     (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS)
-
-#define SKC_PLACE_IS_HEADER_ROW_KEY(i)   ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
-
-
-//
-// Note: HEADER_LESS_THAN purposefully wraps unsigned integer to ~UINT_MAX
-//
-#define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
-#define SKC_PLACE_NODE_LESS_THAN(i,k)   ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id()                          < (k))
-
-//
-// TTSK v2:
-//
-//  0                                       63
-//  | TTSB ID | PREFIX |  SPAN   |  X  |  Y  |
-//  +---------+--------+---------+-----+-----+
-//  |    27   | 1 (=0) | 12 (=0) | 12  | 12  |
-//
-//
-// TTPK v2:
-//
-//  0                                    63
-//  | TTPB ID | PREFIX | SPAN |  X  |  Y  |
-//  +---------+--------+------+-----+-----+
-//  |    27   | 1 (=1) |  12  | 12  | 12  |
-//
-//
-
-//
-// TTCK (32-BIT COMPARE) v1:
-//
-//  0                                                           63
-//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
-//  +----------------------+--------+--------+-------+-----+-----+
-//  |          30          |    1   |    1   |   18  |  7  |  7  |
-//
-//
-// TTCK (32-BIT COMPARE) v2:
-//
-//  0                                                           63
-//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
-//  +----------------------+--------+--------+-------+-----+-----+
-//  |          30          |    1   |    1   |   15  |  9  |  8  |
-//
-//
-// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
-//
-//  0                                                           63
-//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
-//  +----------------------+--------+--------+-------+-----+-----+
-//  |          27          |    1   |    1   |   18  |  9  |  8  |
-//
-
-union skc_subgroup_smem
-{
-  skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // will only use SKC_PLACE_SUBGROUP_SIZE
-
-  struct {
-    struct {
-      skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
-      skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
-    } lo;
-
-    struct {
-      skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
-      skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
-    } hi;
-
-    // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK];
-  };
-
-};
-
-//
-// scatter scan max
-//
-static
-skc_int_v_t
-skc_scatter_scan_max(__local union skc_subgroup_smem  volatile * const smem,
-                     skc_int_v_t                                 const iss,
-                     skc_int_v_t                                 const ess)
-{
-  //
-  // prefix sums determine which lanes we're going to work on next
-  //
-  skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE);
-  skc_int_v_t  const scratch_idx      = max(ess,0);
-
-  //
-  // SIMT
-  //
-
-  //
-  // zero the volatile smem scratchpad using vector syntax
-  //
-  smem->scratch[get_sub_group_local_id()] = ( 0 );
-
-  //
-  // store source lane at starting lane
-  //
-  if (is_scratch_store) {
-    smem->scratch[scratch_idx] = get_sub_group_local_id();
-  }
-
-  //
-  // propagate lanes to right using max scan
-  //
-  skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()];
-  skc_int_v_t const source  = sub_group_scan_inclusive_max(scratch);
-
-  return source;
-}
-
-//
-//
-//
-
-static
-skc_bool
-skc_xk_clip(union skc_tile_clip const * const tile_clip,
-            skc_ttxk_t                * const xk)
-{
-  //
-  // clip the sk and pk keys
-  //
-  // if fully clipped then return false
-  //
-  // alternatively -- we can expand all these keys in place
-  //
-  // alternatively -- keep sk and pk keys segregated because sk
-  // represents the vast majority of keys and are easier to process.
-  // don't mess with the fastpath!
-  //
-  return false;
-}
-
-//
-//
-//
-
-static
-skc_ttck_t
-skc_sk_to_ck(__local union skc_subgroup_smem  volatile * const smem,
-             union skc_cmd_place              const    * const cmd,
-             skc_uint                                    const sk_idx)
-{
-  skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0
-  skc_uint const hi = smem->hi.sk[sk_idx];
-
-  skc_ttck_t ck;
-
-  ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id
-
-  // FIXME -- x and y should already be clipped and shifted
-  skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
-  skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
-
-  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;
-
-  return ck;
-}
-
-static
-skc_ttck_t
-skc_pk_to_ck(__local union skc_subgroup_smem  volatile * const smem,
-             union skc_cmd_place              const    * const cmd,
-             skc_uint                                    const pk_idx,
-             skc_uint                                    const dx)
-{
-  skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1
-  skc_uint const hi = smem->hi.pk[pk_idx];
-
-  skc_ttck_t ck;
-
-  ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id
-
-  // FIXME -- x and y should already be clipped and shifted
-  skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
-  skc_uint const y = (cmd->ty +      SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
-
-  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;
-
-  return ck;
-}
-
-//
-//
-//
-
-static
-void
-skc_ttsk_flush(__global SKC_ATOMIC_UINT         volatile * const place_atomics,
-               __global skc_ttck_t                       * const ck_extent,
-               __local union skc_subgroup_smem  volatile * const smem,
-               union skc_cmd_place              const    * const cmd,
-               skc_uint                         const            sk)
-{
-  //
-  // Pretty sure you can never ever have an sk count equal to 0
-  //
-  skc_uint ck_base = 0;
-
-  // last lane performs the block pool allocation with an atomic increment
-  if (get_sub_group_local_id() == 0) {
-    ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk);
-  }
-
-  // broadcast base to all lanes
-  ck_base = sub_group_broadcast(ck_base,0);
-
-  // convert sk keys to ck keys
-  for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE)
-    {
-      ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii);
-    }
-}
-
-//
-//
-//
-
-static
-skc_int
-skc_ttpk_get_span(__local union skc_subgroup_smem  volatile * const smem,
-                  skc_uint                                    const idx)
-{
-  skc_uint const lo      = smem->lo.pk[idx];
-  skc_uint const hi      = smem->hi.pk[idx];
-
-  skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN;
-  skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN;
-
-  return (span_lo | span_hi) + 1;
-}
-
-//
-//
-//
-
-static
-void
-skc_ttpk_flush(__global SKC_ATOMIC_UINT         volatile * const place_atomics,
-               __global skc_ttck_t                       * const ck_extent,
-               __local union skc_subgroup_smem  volatile * const smem,
-               union skc_cmd_place              const    * const cmd,
-               skc_uint                         const            pk)
-{
-  // bail out if pk queue is empty
-  if (pk == 0)
-    return;
-
-#if 0
-  if (get_sub_group_local_id() == 0)
-    printf("%u\n",pk);
-#endif
-
-  //
-  // FIXME -- this nested loop iterates over the queue processing a
-  // subgroup of 64-bit keys at a time.  This is probably not the most
-  // efficient approach so investigate how to store and iterate over a
-  // wider than subgroup (node-sized) queue of keys.
-  //
-
-  // round up so we work with full subgroups
-  skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK;
-  skc_uint       ii    = 0;
-
-  // nested loop that expands all ttpk keys
-#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE)
-  for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE)
-#endif
-    {
-      skc_uint idx  = ii + get_sub_group_local_id();
-      skc_int  span = 0;
-
-      // how many tiles does this ttpk span?
-      if (idx < pk)
-        span = skc_ttpk_get_span(smem,idx);
-
-      // we need inclusive, exclusive and total
-      skc_int iss = sub_group_scan_inclusive_add(span);
-      skc_int ess = iss - span;
-      skc_int rem = sub_group_broadcast(iss,SKC_PLACE_SUBGROUP_SIZE-1);
-
-      // printf("%u : %u\n",span,iss);
-      // continue;
-
-      // atomically allocate space for the pk keys
-      skc_uint ck_base = 0;
-
-      // last lane performs the block pool allocation with an atomic increment
-      if (get_sub_group_local_id() == 0) {
-        ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,rem);
-      }
-
-      // broadcast atomically allocated extent base to all lanes
-      skc_uint ck_idx = sub_group_broadcast(ck_base,0) + get_sub_group_local_id();
-
-      //
-      // FIXME -- this loop would probably be faster if the ttpk keys
-      // were held in registers and accessed with shuffles instead of
-      // SMEM loads
-      //
-
-      //
-      // loop until there are no more expanded pk keys
-      //
-      while (true)
-        {
-          skc_int const source = skc_scatter_scan_max(smem,iss,ess);
-          skc_int const dx     = get_sub_group_local_id() - intel_sub_group_shuffle(ess,source);
-
-          // store valid ck keys to gmem
-          if (get_sub_group_local_id() < rem) {
-            ck_extent[ck_idx] = skc_pk_to_ck(smem,cmd,ii+source,dx);
-          }
-
-          // decrement remainder
-          rem -= SKC_PLACE_SUBGROUP_SIZE;
-
-          if (rem <= 0)
-            break;
-
-          // increment/decrement indices
-          ck_idx += SKC_PLACE_SUBGROUP_SIZE;
-          iss    -= SKC_PLACE_SUBGROUP_SIZE;
-          ess    -= SKC_PLACE_SUBGROUP_SIZE;
-        }
-    }
-}
-
-//
-//
-//
-
-static
-skc_uint
-skc_ballot(skc_uint * const xk, skc_uint const is_xk)
-{
-#if 0
-  //
-  // FIXME -- when available, this should use the idiom:
-  //
-  //   ballot() + lane_mask_less_than_or_equal + popcount()
-  //
-  // Supported by:
-  //
-  //   - Vulkan 1.1 / SPIR-V 1.3
-  //   - CUDA
-  //   - AVX2 (SSE*?)
-  //
-#else
-  //
-  // otherwise, emulate with an inclusive scan (yuk)
-  //
-  skc_uint const prefix = sub_group_scan_inclusive_add(is_xk);
-
-  skc_uint const xk_idx = *xk + prefix - is_xk;
-
-  *xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST);
-
-#if 0
-  printf("< %3u >\n",xk_idx);
-#endif
-
-  return xk_idx;
-#endif
-}
-
-//
-//
-//
-__kernel
-SKC_PLACE_KERNEL_ATTRIBS
-void
-skc_kernel_place(__global skc_bp_elem_t                * const bp_elems,
-                 __global SKC_ATOMIC_UINT     volatile * const place_atomics,
-                 __global skc_ttck_t                   * const ck_extent,
-                 __global union skc_cmd_place const    * const cmds,
-                 __global skc_block_id_t               * const map,
-                 skc_uint4                               const clip,
-                 skc_uint                                const count)
-{
-  //
-  // declare shared memory block
-  //
-#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
-  __local union skc_subgroup_smem  volatile                smem[1];
-#else
-  __local union skc_subgroup_smem  volatile                smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS];
-  __local union skc_subgroup_smem  volatile * const smem = smem_wg + get_sub_group_id();
-#endif
-
-  //
-  // This is a subgroup-centric kernel
-  //
-  // Which subgroup in the grid is this?
-  //
-  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
-  // get_group_id(0) as a uniform but the alternative calculation used
-  // when there are multiple subgroups per workgroup is not
-  // cooperating and driving spillage elsewhere.
-  //
-  // Test the raster's translated bounds against the composition's
-  // tile clip
-  //
-  // There are 3 cases:
-  //
-  //   - the raster is completely clipped -> return
-  //   - the raster is partially  clipped -> all keys must clipped
-  //   - the raster is not        clipped -> no keys are tested
-  //
-  //
-  // There are at least 4 implementations of place and we want to
-  // special-case them as much as possible so that, at the least, the
-  // fastpath remains fast.
-  //
-  //  - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP
-  //
-  //  - implement CLIPPED + NO TRANSLATION path
-  //
-  //  - implement NO CLIP +    TRANSLATION path
-  //
-  //  - implement CLIPPED +    TRANSLATION path
-  //
-  //
-  // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin
-  // 12:12:8 integer where:
-  //
-  //  12: ttsk
-  //  12: ttpk
-  //   8: /dev/null -- clipped or invalid key
-  //
-  // Three kinds of nodes in a raster's list:
-  //
-  //  - the head node
-  //  - an internal node
-  //  - the final node
-  //
-
-#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
-  skc_uint const cmd_idx = get_group_id(0);
-#else
-  skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id();
-#endif
-
-  // load command
-  union skc_cmd_place const cmd = cmds[cmd_idx];
-
-  // get the raster header from the raster host id -- scalar
-  skc_block_id_t            id  = map[cmd.raster_h];
-
-  //
-  // load all of the head block ttxk keys into registers
-  //
-  // FIXME -- this pattern lends itself to using the higher
-  // performance Intel GEN block load instructions
-  //
-  skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R)                                 \
-  union skc_raster_node_elem const h##I = {                     \
-    .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)],    \
-               bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)]  }  \
-  };
-
-  SKC_PLACE_EXPAND();
-
-  //
-  // load raster header counts -- we only need the "nodes" and "keys"
-  // words but the keys we loaded are doublewords.
-  //
-  // FIXME -- this can be made portable with compile-time macro expansion
-  //
-  skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
-  skc_uint keys  = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
-
-  //
-  //
-  //
-#if 0
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R)                                 \
-  printf("%5u :  %6u : %3u : %08X . %08X - %08X\n",             \
-         nodes,keys,                                            \
-         I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(),  \
-         h##I.u32v2.hi,h##I.u32v2.lo,                           \
-         h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);
-
-  SKC_PLACE_EXPAND();
-#endif
-
-  //
-#if 0
-  if (get_sub_group_local_id() == 0) {
-    printf("place: %u / %u / %u\n",head_id,nodes,keys);
-  }
-#endif
-
-  {
-    //
-    // classify every key in the header
-    //
-    // keys: 0 is not a key / 1 is a key
-    // skpk: 0 is sk        / 1 is pk
-    //
-    skc_uint bits_keys = 0;
-    skc_uint bits_skpk = 0;
-
-    //
-    // calculate bits_keys
-    //
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R)                                         \
-    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                              \
-      skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \
-      if (idx < keys) {                                                 \
-        bits_keys |= (1u << I);                                         \
-      }                                                                 \
-      if (SKC_PLACE_IS_TRAILING_ROW(I)) {                               \
-        if (keys > SKC_RASTER_HEAD_COUNT_KEYS) {                        \
-          if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) {    \
-            bits_keys &= ~(1u << I);                                    \
-          }                                                             \
-        }                                                               \
-      }                                                                 \
-    }
-
-    SKC_PLACE_EXPAND();
-
-    //
-    // blindly calculate bits_skpk
-    //
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R)                                         \
-    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                              \
-      bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
-    }
-
-    SKC_PLACE_EXPAND();
-
-#if 0
-    printf("%2X : %2X\n",bits_keys,bits_skpk);
-#endif
-
-    //
-    // next pointer is last element of last row.  save it now because
-    // this might be recognized as a subgroup-uniform/scalar.
-    //
-    id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);
-
-    //
-    // append SK keys first
-    //
-    skc_uint const bits_sk = bits_keys & ~bits_skpk;
-    skc_uint       sk      = 0;
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R)                 \
-    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {      \
-      skc_uint is_sk  = (bits_sk >> I) & 1;     \
-      skc_uint sk_idx = skc_ballot(&sk,is_sk);  \
-      if (is_sk) {                              \
-        smem->lo.sk[sk_idx] = h##I.xk.lo;       \
-        smem->hi.sk[sk_idx] = h##I.xk.hi;       \
-      }                                         \
-    }
-
-    SKC_PLACE_EXPAND();
-
-    //
-    // append PK keys next
-    //
-    skc_uint const bits_pk = bits_keys & bits_skpk;
-    skc_uint       pk      = 0;
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R)                 \
-    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {      \
-      skc_uint is_pk  = (bits_pk >> I) & 1;     \
-      skc_uint pk_idx = skc_ballot(&pk,is_pk);  \
-      if (is_pk) {                              \
-        smem->lo.pk[pk_idx] = h##I.xk.lo;       \
-        smem->hi.pk[pk_idx] = h##I.xk.hi;       \
-      }                                         \
-    }
-
-    SKC_PLACE_EXPAND();
-
-#if 0
-    printf("%2u * %2u\n",sk,pk);
-#endif
-    //
-    // flush the keys
-    //
-    skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
-    skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
-  }
-
-  //
-  // we're done if there was only a head node
-  //
-  if (nodes == 0)
-    return;
-
-  //
-  // decrement keys
-  //
-  keys -= SKC_RASTER_HEAD_COUNT_KEYS;
-
-  //
-  // otherwise, append keys in trailing nodes to smem
-  //
-  while (true)
-    {
-      //
-      // load all of the node block ttxk keys into registers
-      //
-      // FIXME -- this pattern lends itself to using the higher
-      // performance Intel GEN block load instructions
-      //
-      skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R)                                         \
-      union skc_raster_node_elem const n##I = {                         \
-        .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)],        \
-                   bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)]  }      \
-      };
-
-      SKC_PLACE_EXPAND();
-
-#if 0
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R)                                         \
-      printf("%5u :  %6u : %3u : %08X . %08X - %08X\n",                 \
-             nodes,keys,                                                \
-             I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(),      \
-             n##I.u32v2.hi,n##I.u32v2.lo,                               \
-             n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);
-
-      SKC_PLACE_EXPAND();
-#endif
-
-      //
-      // classify every key in the header
-      //
-      // keys: 0 is not a key / 1 is a key
-      // skpk: 0 is sk        / 1 is pk
-      //
-      skc_uint bits_keys = 0;
-      skc_uint bits_skpk = 0;
-
-      //
-      // calculate bits_keys
-      //
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) {                                       \
-        skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \
-        if (idx < keys) {                                               \
-          bits_keys |= (1u << I);                                       \
-        }                                                               \
-        if (SKC_PLACE_IS_TRAILING_ROW(I)) {                             \
-          if (keys > SKC_RASTER_NODE_COUNT_KEYS) {                      \
-            if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) {  \
-              bits_keys &= ~(1u << I);                                  \
-            }                                                           \
-          }                                                             \
-        }                                                               \
-      }
-
-      SKC_PLACE_EXPAND();
-
-      //
-      // blindly calculate bits_skpk
-      //
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) {                                       \
-        bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
-      }
-
-      SKC_PLACE_EXPAND();
-
-#if 0
-      printf("%2X : %2X\n",bits_keys,bits_skpk);
-#endif
-
-      //
-      // next pointer is last element of last row.  save it now because
-      // this might be recognized as a subgroup-uniform/scalar.
-      //
-      id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);
-
-      //
-      // append SK keys first
-      //
-      skc_uint const bits_sk = bits_keys & ~bits_skpk;
-      skc_uint       sk      = 0;
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) {                       \
-        skc_uint is_sk  = (bits_sk >> I) & 1;           \
-        skc_uint sk_idx = skc_ballot(&sk,is_sk);        \
-        if (is_sk) {                                    \
-          smem->lo.sk[sk_idx] = n##I.xk.lo;             \
-          smem->hi.sk[sk_idx] = n##I.xk.hi;             \
-        }                                               \
-      }
-
-      SKC_PLACE_EXPAND();
-
-      //
-      // append PK keys next
-      //
-      skc_uint const bits_pk = bits_keys & bits_skpk;
-      skc_uint       pk      = 0;
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) {                       \
-        skc_uint is_pk  = (bits_pk >> I) & 1;           \
-        skc_uint pk_idx = skc_ballot(&pk,is_pk);        \
-        if (is_pk) {                                    \
-          smem->lo.pk[pk_idx] = n##I.xk.lo;             \
-          smem->hi.pk[pk_idx] = n##I.xk.hi;             \
-        }                                               \
-      }
-
-      SKC_PLACE_EXPAND();
-
-#if 0
-    printf("%2u * %2u\n",sk,pk);
-#endif
-      //
-      // if total for either the sk or pk queue reaches the
-      // highwater mark then flush it to the extent
-      //
-      skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
-      skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
-
-      //
-      // if this was the last node then we're done
-      //
-      if (--nodes == 0)
-        return;
-
-      //
-      // otherwise decrement keys
-      //
-      keys -= SKC_RASTER_NODE_COUNT_KEYS;
-    }
-}
-
-//
-//
-//
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "tile.h"
+#include "common.h"
+#include "raster.h"
+#include "atomic_cl.h"
+#include "kernel_cl_12.h"
+
+//
+// Both expand to (SKC_PLACE_SUBGROUP_SIZE - 1): a lane-index mask (only a
+// valid bit mask for a power-of-two size) and the index of the last lane
+
+#define SKC_PLACE_SUBGROUP_MASK      (SKC_PLACE_SUBGROUP_SIZE - 1)
+#define SKC_PLACE_SUBGROUP_LAST      (SKC_PLACE_SUBGROUP_SIZE - 1)
+
+// SMEM element counts for TTSK and TTPK keys: TTSK takes SKC_MAX_MACRO of
+// SKC_RASTER_NODE_MAX_TTSK and the subgroup size (presumably the larger of
+// the two -- SKC_MAX_MACRO is defined elsewhere); TTPK is used as-is
+
+#define SKC_PLACE_SMEM_COUNT_TTSK    SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE)
+#define SKC_PLACE_SMEM_COUNT_TTPK    SKC_RASTER_NODE_MAX_TTPK
+
+//
+// SKC_DEVICE_BLOCK_DWORDS divided by the subgroup size, i.e. the number of
+// subgroup-wide rows per block; selects the expansion macro below
+
+#define SKC_PLACE_X                  (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE)
+
+//
+// Pick the SKC_EXPAND_n() whose n equals SKC_PLACE_X; I_LAST is n - 1.
+// NOTE(review): no #else/#error -- any other SKC_PLACE_X leaves both undefined
+
+#if   ( SKC_PLACE_X == 1 )
+#define SKC_PLACE_EXPAND()           SKC_EXPAND_1()
+#define SKC_PLACE_EXPAND_I_LAST      0
+
+#elif ( SKC_PLACE_X == 2 )
+#define SKC_PLACE_EXPAND()           SKC_EXPAND_2()
+#define SKC_PLACE_EXPAND_I_LAST      1
+
+#elif ( SKC_PLACE_X == 4 )
+#define SKC_PLACE_EXPAND()           SKC_EXPAND_4()
+#define SKC_PLACE_EXPAND_I_LAST      3
+
+#elif ( SKC_PLACE_X == 8 )
+#define SKC_PLACE_EXPAND()           SKC_EXPAND_8()
+#define SKC_PLACE_EXPAND_I_LAST      7
+
+#elif ( SKC_PLACE_X == 16)
+#define SKC_PLACE_EXPAND()           SKC_EXPAND_16()
+#define SKC_PLACE_EXPAND_I_LAST      15
+#endif
+
+//
+// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE
+// COALESCED WRITES.  LO FIRST, FOLLOWED BY HI.
+//
+// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE
+// KERNELS USE DIFFERENT SUBGROUP SIZES.
+//
+// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE
+// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID.
+//
+// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER
+// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). VULKAN MAY
+// ONLY SUPPORT A SUBGROUP SIZE OF 16.
+//
+
+#if    ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE )
+
+#define SKC_PLACE_STRIDE_H(L)              (L)
+#define SKC_PLACE_STRIDE_V_LO(I)           (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
+#define SKC_PLACE_STRIDE_V_HI(I)           (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE)
+
+#elif  ( SKC_PREFIX_SUBGROUP_SIZE >  SKC_PLACE_SUBGROUP_SIZE ) // reduces to the equal-size case when ratio equals 1
+
+#define SKC_PLACE_SUBGROUP_RATIO           (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE)
+#define SKC_PLACE_SUBGROUP_RATIO_MASK      (SKC_PLACE_SUBGROUP_RATIO - 1)
+#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I)  ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK))
+
+#define SKC_PLACE_STRIDE_H(L)              (L)
+#define SKC_PLACE_STRIDE_V_LO(I)           (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE)
+#define SKC_PLACE_STRIDE_V_HI(I)           (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE)
+
+#elif  ( SKC_PREFIX_SUBGROUP_SIZE <  SKC_PLACE_SUBGROUP_SIZE ) // also reduces to the equal-size case when ratio equals 1
+
+#define SKC_PLACE_SUBGROUP_RATIO           (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
+#define SKC_PLACE_SUBGROUP_RATIO_MASK      (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
+
+#define SKC_PLACE_STRIDE_H(L)              (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK))
+#define SKC_PLACE_STRIDE_V_LO(I)           (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
+#define SKC_PLACE_STRIDE_V_HI(I)           (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO)
+
+#endif
+
+//
+// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE
+// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8)
+//
+
+#define SKC_PLACE_IS_ALL_HEADER_ROW(i)   (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS)
+
+#define SKC_PLACE_IS_NOT_HEADER_ROW(i)   ( (i)    * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS)
+
+#define SKC_PLACE_IS_TRAILING_ROW(i)     (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS)
+
+#define SKC_PLACE_IS_HEADER_ROW_KEY(i)   ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k)) // NOTE(review): 'k' is not a parameter here; same body as SKC_PLACE_HEADER_LESS_THAN(i,k)
+
+
+// Note: HEADER_LESS_THAN purposefully lets the unsigned index wrap around to
+// a value near UINT_MAX when it is below SKC_RASTER_HEAD_DWORDS
+//
+#define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
+#define SKC_PLACE_NODE_LESS_THAN(i,k)   ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id()                          < (k))
+
+//
+// TTSK v2:
+//
+//  0                                       63
+//  | TTSB ID | PREFIX |  SPAN   |  X  |  Y  |
+//  +---------+--------+---------+-----+-----+
+//  |    27   | 1 (=0) | 12 (=0) | 12  | 12  |
+//
+//
+// TTPK v2:
+//
+//  0                                    63
+//  | TTPB ID | PREFIX | SPAN |  X  |  Y  |
+//  +---------+--------+------+-----+-----+
+//  |    27   | 1 (=1) |  12  | 12  | 12  |
+//
+//
+
+//
+// TTCK (32-BIT COMPARE) v1:
+//
+//  0                                                           63
+//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
+//  +----------------------+--------+--------+-------+-----+-----+
+//  |          30          |    1   |    1   |   18  |  7  |  7  |
+//
+//
+// TTCK (32-BIT COMPARE) v2:
+//
+//  0                                                           63
+//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
+//  +----------------------+--------+--------+-------+-----+-----+
+//  |          30          |    1   |    1   |   15  |  9  |  8  |
+//
+//
+// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
+//
+//  0                                                           63
+//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
+//  +----------------------+--------+--------+-------+-----+-----+
+//  |          27          |    1   |    1   |   18  |  9  |  8  |
+//
+
+union skc_subgroup_smem // local memory used by one subgroup: either a scan scratchpad or the sk/pk key queues
+{
+  skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // one slot per lane -- written and read by skc_scatter_scan_max()
+  // NOTE: being a union member, scratch overlays the start of lo.sk below -- using scratch clobbers queued sk keys
+  struct {
+    struct {
+      skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
+      skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
+    } lo;
+    // lo and hi words of each queued key are kept in separate arrays and share the same index
+    struct {
+      skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
+      skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
+    } hi;
+
+    // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK];
+  };
+
+};
+
+//
+// scatter scan max -- given per-lane inclusive (iss) and exclusive (ess) sums, return for each lane the highest lane id stored at or to its left
+//
+static
+skc_int_v_t
+skc_scatter_scan_max(__local union skc_subgroup_smem  volatile * const smem,
+                     skc_int_v_t                                 const iss,
+                     skc_int_v_t                                 const ess)
+{
+  //
+  // a lane stores only if its range is not exhausted (iss > 0) and starts inside this subgroup-wide window
+  //
+  skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE);
+  skc_int_v_t  const scratch_idx      = max(ess,0); // a range that began in an earlier window is clamped to slot 0
+
+  //
+  // SIMT
+  //
+
+  //
+  // zero the volatile smem scratchpad -- every lane clears its own slot
+  //
+  smem->scratch[get_sub_group_local_id()] = ( 0 );
+
+  //
+  // store source lane at starting lane
+  // NOTE(review): no barrier separates these stores from the loads below -- presumably relies on subgroup lockstep execution; confirm
+  if (is_scratch_store) {
+    smem->scratch[scratch_idx] = get_sub_group_local_id();
+  }
+
+  //
+  // propagate lanes to right using max scan
+  //
+  skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()];
+  skc_int_v_t const source  = sub_group_scan_inclusive_max(scratch);
+
+  return source;
+}
+
+//
+// Placeholder for clipping a ttxk key against the tile clip -- not implemented: both arguments are ignored and false is always returned
+//
+
+static
+skc_bool
+skc_xk_clip(union skc_tile_clip const * const tile_clip,
+            skc_ttxk_t                * const xk)
+{
+  //
+  // intended behavior: clip the sk and pk keys
+  //
+  // if fully clipped then return false
+  //
+  // alternatively -- we can expand all these keys in place
+  //
+  // alternatively -- keep sk and pk keys segregated because sk
+  // represents the vast majority of keys and are easier to process.
+  // don't mess with the fastpath!
+  //
+  return false;
+}
+
+//
+// Build a TTCK key from the TTSK key queued at smem->{lo,hi}.sk[sk_idx]:
+// the command's layer id is merged in and cmd->tx/ty are added to the key's x/y.
+//
+static
+skc_ttck_t
+skc_sk_to_ck(__local union skc_subgroup_smem  volatile * const smem,
+             union skc_cmd_place              const    * const cmd,
+             skc_uint                                    const sk_idx)
+{
+  // fetch both halves of the queued key -- lo is used unmasked (assumes prefix bit is 0)
+  skc_uint const ttsk_lo = smem->lo.sk[sk_idx];
+  skc_uint const ttsk_hi = smem->hi.sk[sk_idx];
+
+  // FIXME -- x and y should already be clipped and shifted
+  skc_uint const tile_x = (cmd->tx + SKC_BFE(ttsk_hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
+  skc_uint const tile_y = (cmd->ty + SKC_BFE(ttsk_hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
+
+  // FIXME -- preshift the layer id
+  skc_ttck_t ttck;
+  ttck.lo = (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE) | ttsk_lo;
+  ttck.hi = tile_x | tile_y | (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER);
+
+  return ttck;
+}
+
+// Build the TTCK key for the tile at offset dx within the TTPK key queued at
+// smem->{lo,hi}.pk[pk_idx]; the layer id is merged in and cmd->tx/ty are added to x/y.
+static
+skc_ttck_t
+skc_pk_to_ck(__local union skc_subgroup_smem  volatile * const smem,
+             union skc_cmd_place              const    * const cmd,
+             skc_uint                                    const pk_idx,
+             skc_uint                                    const dx)
+{
+  // lo word is filtered through SKC_TTXK_LO_MASK_ID_PREFIX (assumes prefix bit is 1)
+  skc_uint const ttpk_lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX;
+  skc_uint const ttpk_hi = smem->hi.pk[pk_idx];
+
+  // FIXME -- x and y should already be clipped and shifted
+  skc_uint const tile_x = (cmd->tx + dx + SKC_BFE(ttpk_hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
+  skc_uint const tile_y = (cmd->ty +      SKC_BFE(ttpk_hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
+
+  skc_ttck_t ttck;
+  ttck.lo = (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE) | ttpk_lo; // FIXME -- preshift the layer id
+  ttck.hi = tile_x | tile_y | (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER);
+  return ttck;
+}
+
+//
+// Append the `sk` queued TTSK keys to ck_extent: reserve `sk` slots with one atomic add on place_atomics, then convert and store the keys a subgroup at a time
+//
+
+static
+void
+skc_ttsk_flush(__global SKC_ATOMIC_UINT         volatile * const place_atomics,
+               __global skc_ttck_t                       * const ck_extent,
+               __local union skc_subgroup_smem  volatile * const smem,
+               union skc_cmd_place              const    * const cmd,
+               skc_uint                         const            sk)
+{
+  //
+  // Pretty sure you can never ever have an sk count equal to 0 -- there is no early-out, so sk == 0 would still issue the atomic add
+  //
+  skc_uint ck_base = 0;
+
+  // lane 0 reserves the extent range with an atomic add -- presumably returns the pre-add counter value; confirm in atomic_cl.h
+  if (get_sub_group_local_id() == 0) {
+    ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk);
+  }
+
+  // broadcast base from lane 0 to all lanes
+  ck_base = sub_group_broadcast(ck_base,0);
+
+  // convert sk keys to ck keys -- lane L handles queue entries L, L+SIZE, L+2*SIZE, ...
+  for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE)
+    {
+      ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii);
+    }
+}
+
+//
+// Number of tiles covered by the TTPK key queued at smem->{lo,hi}.pk[idx].
+// The span field straddles both words; the result is the stored field plus one.
+//
+static
+skc_int
+skc_ttpk_get_span(__local union skc_subgroup_smem  volatile * const smem,
+                  skc_uint                                    const idx)
+{
+  skc_uint const ttpk_lo      = smem->lo.pk[idx];
+  skc_uint const ttpk_hi      = smem->hi.pk[idx];
+  // low span bits are the topmost bits of lo, the remaining bits are the lowest bits of hi
+  skc_uint const span_bits_lo = ttpk_lo >> SKC_TTXK_LO_OFFSET_SPAN;
+  skc_uint const span_bits_hi = (ttpk_hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN;
+  skc_uint const span_minus_1 = span_bits_hi | span_bits_lo;
+  return span_minus_1 + 1;
+}
+
+//
+// Expand the `pk` queued TTPK keys into one TTCK key per spanned tile and append them to ck_extent, a subgroup of queued keys at a time
+//
+
+static
+void
+skc_ttpk_flush(__global SKC_ATOMIC_UINT         volatile * const place_atomics,
+               __global skc_ttck_t                       * const ck_extent,
+               __local union skc_subgroup_smem  volatile * const smem,
+               union skc_cmd_place              const    * const cmd,
+               skc_uint                         const            pk)
+{
+  // bail out if pk queue is empty
+  if (pk == 0)
+    return;
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("%u\n",pk);
+#endif
+
+  //
+  // FIXME -- this nested loop iterates over the queue processing a
+  // subgroup of 64-bit keys at a time.  This is probably not the most
+  // efficient approach so investigate how to store and iterate over a
+  // wider than subgroup (node-sized) queue of keys.
+  //
+
+  // round up so we work with full subgroups
+  skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK;
+  skc_uint       ii    = 0;
+
+  // nested loop that expands all ttpk keys -- when the queue cannot exceed one subgroup the `for` is compiled out and the body runs once with ii == 0
+#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE)
+  for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE)
+#endif
+    {
+      skc_uint idx  = ii + get_sub_group_local_id();
+      skc_int  span = 0;
+
+      // how many tiles does this ttpk span?  lanes past the end of the queue keep span == 0
+      if (idx < pk)
+        span = skc_ttpk_get_span(smem,idx);
+
+      // we need inclusive, exclusive and total (total is the last lane's inclusive sum)
+      skc_int iss = sub_group_scan_inclusive_add(span);
+      skc_int ess = iss - span;
+      skc_int rem = sub_group_broadcast(iss,SKC_PLACE_SUBGROUP_SIZE-1);
+
+      // printf("%u : %u\n",span,iss);
+      // continue;
+
+      // atomically allocate space for the pk keys
+      skc_uint ck_base = 0;
+
+      // lane 0 reserves `rem` extent slots with an atomic add
+      if (get_sub_group_local_id() == 0) {
+        ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,rem);
+      }
+
+      // broadcast atomically allocated extent base to all lanes -- each lane then owns slot base + lane
+      skc_uint ck_idx = sub_group_broadcast(ck_base,0) + get_sub_group_local_id();
+
+      //
+      // FIXME -- this loop would probably be faster if the ttpk keys
+      // were held in registers and accessed with shuffles instead of
+      // SMEM loads
+      //
+
+      //
+      // loop until there are no more expanded pk keys -- each pass emits up to SKC_PLACE_SUBGROUP_SIZE keys
+      //
+      while (true)
+        {
+          skc_int const source = skc_scatter_scan_max(smem,iss,ess); // lane holding the ttpk this output slot expands
+          skc_int const dx     = get_sub_group_local_id() - intel_sub_group_shuffle(ess,source); // tile offset inside that ttpk's span
+
+          // store valid ck keys to gmem
+          if (get_sub_group_local_id() < rem) {
+            ck_extent[ck_idx] = skc_pk_to_ck(smem,cmd,ii+source,dx);
+          }
+
+          // decrement remainder
+          rem -= SKC_PLACE_SUBGROUP_SIZE;
+
+          if (rem <= 0)
+            break;
+
+          // slide the output window right by one subgroup: sums move left, extent index moves right
+          ck_idx += SKC_PLACE_SUBGROUP_SIZE;
+          iss    -= SKC_PLACE_SUBGROUP_SIZE;
+          ess    -= SKC_PLACE_SUBGROUP_SIZE;
+        }
+    }
+}
+
+//
+// Compacting index: returns *xk plus the number of lower lanes with is_xk set, then advances *xk by the subgroup-wide total of is_xk
+//
+
+static
+skc_uint
+skc_ballot(skc_uint * const xk, skc_uint const is_xk)
+{
+#if 0
+  //
+  // FIXME -- when available, this should use the idiom:
+  //
+  //   ballot() + lane_mask_less_than_or_equal + popcount()
+  //
+  // Supported by:
+  //
+  //   - Vulkan 1.1 / SPIR-V 1.3
+  //   - CUDA
+  //   - AVX2 (SSE*?)
+  //
+#else
+  //
+  // otherwise, emulate with an inclusive scan (yuk) -- callers pass is_xk as 0 or 1
+  //
+  skc_uint const prefix = sub_group_scan_inclusive_add(is_xk);
+  // subtracting is_xk turns the inclusive sum into an exclusive one
+  skc_uint const xk_idx = *xk + prefix - is_xk;
+  // the last lane's inclusive sum is the subgroup total
+  *xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST);
+
+#if 0
+  printf("< %3u >\n",xk_idx);
+#endif
+
+  return xk_idx;
+#endif
+}
+
+//
+// PLACE KERNEL -- each subgroup loads one place command, walks the raster's head node and trailing nodes, and appends a TTCK key for every TTSK key and for every tile spanned by a TTPK key
+//
+__kernel
+SKC_PLACE_KERNEL_ATTRIBS
+void
+skc_kernel_place(__global skc_bp_elem_t                * const bp_elems,
+                 __global SKC_ATOMIC_UINT     volatile * const place_atomics,
+                 __global skc_ttck_t                   * const ck_extent,
+                 __global union skc_cmd_place const    * const cmds,
+                 __global skc_block_id_t               * const map,
+                 skc_uint4                               const clip, // NOTE: not referenced in this kernel body
+                 skc_uint                                const count) // NOTE: not referenced -- cmd_idx is never range-checked against it
+{
+  //
+  // declare shared memory block
+  //
+#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
+  __local union skc_subgroup_smem  volatile                smem[1];
+#else
+  __local union skc_subgroup_smem  volatile                smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS];
+  __local union skc_subgroup_smem  volatile * const smem = smem_wg + get_sub_group_id();
+#endif
+
+  //
+  // This is a subgroup-centric kernel
+  //
+  // Which subgroup in the grid is this?
+  //
+  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
+  // get_group_id(0) as a uniform but the alternative calculation used
+  // when there are multiple subgroups per workgroup is not
+  // cooperating and driving spillage elsewhere.
+  //
+  // Test the raster's translated bounds against the composition's
+  // tile clip
+  //
+  // There are 3 cases:
+  //
+  //   - the raster is completely clipped -> return
+  //   - the raster is partially  clipped -> all keys must be clipped
+  //   - the raster is not        clipped -> no keys are tested
+  //
+  //
+  // There are at least 4 implementations of place and we want to
+  // special-case them as much as possible so that, at the least, the
+  // fastpath remains fast.
+  //
+  //  - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP
+  //
+  //  - implement CLIPPED + NO TRANSLATION path
+  //
+  //  - implement NO CLIP +    TRANSLATION path
+  //
+  //  - implement CLIPPED +    TRANSLATION path
+  //
+  //
+  // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin
+  // 12:12:8 integer where:
+  //
+  //  12: ttsk
+  //  12: ttpk
+  //   8: /dev/null -- clipped or invalid key
+  //
+  // Three kinds of nodes in a raster's list:
+  //
+  //  - the head node
+  //  - an internal node
+  //  - the final node
+  //
+
+#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
+  skc_uint const cmd_idx = get_group_id(0);
+#else
+  skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
+
+  // load command
+  union skc_cmd_place const cmd = cmds[cmd_idx];
+
+  // get the raster header from the raster host id -- scalar
+  skc_block_id_t            id  = map[cmd.raster_h];
+
+  //
+  // load all of the head block ttxk keys into registers
+  //
+  // FIXME -- this pattern lends itself to using the higher
+  // performance Intel GEN block load instructions
+  //
+  skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                 \
+  union skc_raster_node_elem const h##I = {                     \
+    .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)],    \
+               bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)]  }  \
+  };
+
+  SKC_PLACE_EXPAND();
+
+  //
+  // load raster header counts -- we only need the "nodes" and "keys"
+  // words but the keys we loaded are doublewords.
+  //
+  // FIXME -- this can be made portable with compile-time macro expansion
+  //
+  skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
+  skc_uint keys  = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
+
+  //
+  //
+  //
+#if 0
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                 \
+  printf("%5u :  %6u : %3u : %08X . %08X - %08X\n",             \
+         nodes,keys,                                            \
+         I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(),  \
+         h##I.u32v2.hi,h##I.u32v2.lo,                           \
+         h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);
+
+  SKC_PLACE_EXPAND();
+#endif
+
+  //
+#if 0
+  if (get_sub_group_local_id() == 0) {
+    printf("place: %u / %u / %u\n",head_id,nodes,keys);
+  }
+#endif
+
+  {
+    //
+    // classify every key in the header
+    //
+    // keys: 0 is not a key / 1 is a key
+    // skpk: 0 is sk        / 1 is pk
+    //
+    skc_uint bits_keys = 0;
+    skc_uint bits_skpk = 0;
+
+    //
+    // calculate bits_keys -- bit I is set when this lane's slot in row I holds a key; the last slot of the block is excluded when more keys follow
+    //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                              \
+      skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \
+      if (idx < keys) {                                                 \
+        bits_keys |= (1u << I);                                         \
+      }                                                                 \
+      if (SKC_PLACE_IS_TRAILING_ROW(I)) {                               \
+        if (keys > SKC_RASTER_HEAD_COUNT_KEYS) {                        \
+          if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) {    \
+            bits_keys &= ~(1u << I);                                    \
+          }                                                             \
+        }                                                               \
+      }                                                                 \
+    }
+
+    SKC_PLACE_EXPAND();
+
+    //
+    // blindly calculate bits_skpk -- row I's prefix bit is shifted down to bit I
+    // (slots that hold no key are masked out later by bits_keys)
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                              \
+      bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
+    }
+
+    SKC_PLACE_EXPAND();
+
+#if 0
+    printf("%2X : %2X\n",bits_keys,bits_skpk);
+#endif
+
+    //
+    // next pointer is last element of last row.  save it now because
+    // this might be recognized as a subgroup-uniform/scalar.
+    //
+    id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);
+
+    //
+    // append SK keys first -- keys whose prefix bit is clear
+    //
+    skc_uint const bits_sk = bits_keys & ~bits_skpk;
+    skc_uint       sk      = 0;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                 \
+    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {      \
+      skc_uint is_sk  = (bits_sk >> I) & 1;     \
+      skc_uint sk_idx = skc_ballot(&sk,is_sk);  \
+      if (is_sk) {                              \
+        smem->lo.sk[sk_idx] = h##I.xk.lo;       \
+        smem->hi.sk[sk_idx] = h##I.xk.hi;       \
+      }                                         \
+    }
+
+    SKC_PLACE_EXPAND();
+
+    //
+    // append PK keys next -- keys whose prefix bit is set
+    //
+    skc_uint const bits_pk = bits_keys & bits_skpk;
+    skc_uint       pk      = 0;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                 \
+    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {      \
+      skc_uint is_pk  = (bits_pk >> I) & 1;     \
+      skc_uint pk_idx = skc_ballot(&pk,is_pk);  \
+      if (is_pk) {                              \
+        smem->lo.pk[pk_idx] = h##I.xk.lo;       \
+        smem->hi.pk[pk_idx] = h##I.xk.hi;       \
+      }                                         \
+    }
+
+    SKC_PLACE_EXPAND();
+
+#if 0
+    printf("%2u * %2u\n",sk,pk);
+#endif
+    //
+    // flush the keys -- sk first: the pk flush reuses smem->scratch, which overlays the sk queue
+    //
+    skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
+    skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
+  }
+
+  //
+  // we're done if there was only a head node
+  //
+  if (nodes == 0)
+    return;
+
+  //
+  // decrement keys -- the head node's share has been consumed
+  //
+  keys -= SKC_RASTER_HEAD_COUNT_KEYS;
+
+  //
+  // otherwise, append keys in trailing nodes to smem
+  //
+  while (true)
+    {
+      //
+      // load all of the node block ttxk keys into registers
+      //
+      // FIXME -- this pattern lends itself to using the higher
+      // performance Intel GEN block load instructions
+      //
+      skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+      union skc_raster_node_elem const n##I = {                         \
+        .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)],        \
+                   bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)]  }      \
+      };
+
+      SKC_PLACE_EXPAND();
+
+#if 0
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+      printf("%5u :  %6u : %3u : %08X . %08X - %08X\n",                 \
+             nodes,keys,                                                \
+             I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(),      \
+             n##I.u32v2.hi,n##I.u32v2.lo,                               \
+             n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);
+
+      SKC_PLACE_EXPAND();
+#endif
+
+      //
+      // classify every key in the node
+      //
+      // keys: 0 is not a key / 1 is a key
+      // skpk: 0 is sk        / 1 is pk
+      //
+      skc_uint bits_keys = 0;
+      skc_uint bits_skpk = 0;
+
+      //
+      // calculate bits_keys -- same as for the head node but with no header dwords to skip
+      //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) {                                       \
+        skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \
+        if (idx < keys) {                                               \
+          bits_keys |= (1u << I);                                       \
+        }                                                               \
+        if (SKC_PLACE_IS_TRAILING_ROW(I)) {                             \
+          if (keys > SKC_RASTER_NODE_COUNT_KEYS) {                      \
+            if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) {  \
+              bits_keys &= ~(1u << I);                                  \
+            }                                                           \
+          }                                                             \
+        }                                                               \
+      }
+
+      SKC_PLACE_EXPAND();
+
+      //
+      // blindly calculate bits_skpk -- row I's prefix bit is shifted down to bit I
+      //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) {                                       \
+        bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
+      }
+
+      SKC_PLACE_EXPAND();
+
+#if 0
+      printf("%2X : %2X\n",bits_keys,bits_skpk);
+#endif
+
+      //
+      // next pointer is last element of last row.  save it now because
+      // this might be recognized as a subgroup-uniform/scalar.
+      //
+      id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);
+
+      //
+      // append SK keys first -- keys whose prefix bit is clear
+      //
+      skc_uint const bits_sk = bits_keys & ~bits_skpk;
+      skc_uint       sk      = 0;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) {                       \
+        skc_uint is_sk  = (bits_sk >> I) & 1;           \
+        skc_uint sk_idx = skc_ballot(&sk,is_sk);        \
+        if (is_sk) {                                    \
+          smem->lo.sk[sk_idx] = n##I.xk.lo;             \
+          smem->hi.sk[sk_idx] = n##I.xk.hi;             \
+        }                                               \
+      }
+
+      SKC_PLACE_EXPAND();
+
+      //
+      // append PK keys next -- keys whose prefix bit is set
+      //
+      skc_uint const bits_pk = bits_keys & bits_skpk;
+      skc_uint       pk      = 0;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) {                       \
+        skc_uint is_pk  = (bits_pk >> I) & 1;           \
+        skc_uint pk_idx = skc_ballot(&pk,is_pk);        \
+        if (is_pk) {                                    \
+          smem->lo.pk[pk_idx] = n##I.xk.lo;             \
+          smem->hi.pk[pk_idx] = n##I.xk.hi;             \
+        }                                               \
+      }
+
+      SKC_PLACE_EXPAND();
+
+#if 0
+    printf("%2u * %2u\n",sk,pk);
+#endif
+      //
+      // flush both queues to the extent after every node -- there is no
+      // highwater test: sk and pk restart at 0 on each iteration
+      //
+      skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
+      skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
+
+      //
+      // if this was the last node then we're done
+      //
+      if (--nodes == 0)
+        return;
+
+      //
+      // otherwise decrement keys
+      //
+      keys -= SKC_RASTER_NODE_COUNT_KEYS;
+    }
+}
+
+//
+//
+//