1 files changed, 543 insertions, 543 deletions
diff --git a/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl b/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl
index 302ea14af2..63a1a43177 100644
--- a/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl
@@ -1,543 +1,543 @@
-/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "path.h"
-#include "block_pool_cl.h"
-#include "path_builder_cl_12.h"
-#include "device_cl_12.h"
-
-//
-//
-//
-
-#if 0
-
-//
-// SIMD AVX2
-//
-
-#define SKC_PATHS_COPY_WORDS_PER_ELEM          8
-#define SKC_PATHS_COPY_SUBGROUP_SIZE           1
-#define SKC_PATHS_COPY_KERNEL_ATTRIBUTES
-
-typedef skc_uint8  skc_paths_copy_elem;
-typedef skc_uint8  skc_pb_idx_v;
-
-#define SKC_PATHS_COPY_ELEM_EXPAND()           SKC_EXPAND_8()
-
-#define SKC_IS_NOT_PATH_HEAD(sg,I)             ((sg) + I >= SKC_PATH_HEAD_WORDS)
-
-#endif
-
-//
-//
-//
-
-#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK      (SKC_PATHS_COPY_SUBGROUP_SIZE - 1)
-#define SKC_PATHS_COPY_ELEMS_PER_BLOCK         (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
-#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK      (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
-#define SKC_PATHS_COPY_ELEMS_PER_THREAD        (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE)
-
-// FIXME -- use SUBGROUP terminology everywhere
-#define SKC_PATHS_COPY_SUBGROUP_WORDS          (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS)
-
-//
-//
-//
-
-#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER                              \
-  (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS))
-
-#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER                           \
-  (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS))
-
-// #define SKC_PATHS_COPY_HEAD_ELEMS    ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS)
-
-//
-//
-//
-
-//
-// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL
-//
-
-#define SKC_CMD_PATHS_COPY_ONE_BITS              (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2)
-
-#define SKC_CMD_PATHS_COPY_ONE_MASK              SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS)
-
-#define SKC_CMD_PATHS_COPY_ONE                   (1u << SKC_CMD_PATHS_COPY_ONE_BITS)
-
-#define SKC_CMD_PATHS_COPY_GET_TAG(ti)           SKC_TAGGED_BLOCK_ID_GET_TAG(ti)
-
-#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti)       ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS)
-
-#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b)  (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG))
-
-//
-//
-//
-
-skc_uint
-skc_sub_group_local_id()
-{
-#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
-  return get_sub_group_local_id();
-#else
-  return 0;
-#endif
-}
-
-//
-// convert an atomic read counter offset to a block id
-//
-
-skc_block_id_t
-skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids,
-                 skc_uint                        const bp_idx_mask,
-                 skc_uint                        const bp_reads,
-                 skc_uint                        const bp_off)
-{
-  skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask;
-
-  return bp_ids[bp_idx];
-}
-
-//
-//
-//
-
-void
-skc_copy_segs(__global skc_paths_copy_elem       * const bp_elems, // to
-              skc_uint                             const bp_elems_idx,
-              __global skc_paths_copy_elem const * const pb_elems, // from
-              skc_uint                             const pb_elems_idx)
-{
-  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
-    {
-      (bp_elems+bp_elems_idx)[ii] = (pb_elems+pb_elems_idx)[ii];
-    }
-
-#if 0
-  //
-  // NOTE THIS IS PRINTING 8 ROWS
-  //
-  printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
-         (skc_uint)get_global_id(0),pb_elems_idx,
-         as_float((pb_elems+pb_elems_idx)[0*SKC_PATHS_COPY_SUBGROUP_SIZE]),
-         as_float((pb_elems+pb_elems_idx)[1*SKC_PATHS_COPY_SUBGROUP_SIZE]),
-         as_float((pb_elems+pb_elems_idx)[2*SKC_PATHS_COPY_SUBGROUP_SIZE]),
-         as_float((pb_elems+pb_elems_idx)[3*SKC_PATHS_COPY_SUBGROUP_SIZE]));
-  printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
-         (skc_uint)get_global_id(0),pb_elems_idx,
-         as_float((pb_elems+pb_elems_idx)[4*SKC_PATHS_COPY_SUBGROUP_SIZE]),
-         as_float((pb_elems+pb_elems_idx)[5*SKC_PATHS_COPY_SUBGROUP_SIZE]),
-         as_float((pb_elems+pb_elems_idx)[6*SKC_PATHS_COPY_SUBGROUP_SIZE]),
-         as_float((pb_elems+pb_elems_idx)[7*SKC_PATHS_COPY_SUBGROUP_SIZE]));
-#endif
-}
-
-//
-//
-//
-
-void
-skc_copy_node(__global skc_paths_copy_elem       * const bp_elems, // to
-              skc_uint                             const bp_elems_idx,
-              __global skc_block_id_t      const * const bp_ids,
-              skc_uint                             const bp_reads,
-              skc_uint                             const bp_idx_mask,
-              __global skc_paths_copy_elem const * const pb_elems, // from
-              skc_uint                             const pb_elems_idx,
-              skc_uint                             const pb_rolling)
-{
-  //
-  // remap block id tags bp_elems the host-side rolling counter pb_elems a
-  // device-side block pool id
-  //
-  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
-    {
-      // load block_id_tag words
-      skc_paths_copy_elem elem   = (pb_elems + pb_elems_idx)[ii];
-
-      // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
-      skc_pb_idx_v  const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
-
-      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
-
-      //
-      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
-      // will _always_ be safe as long as we don't use the loaded
-      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
-      // of iterating over the vector components.
-      //
-
-      // only convert if original elem is not invalid
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R)                                 \
-      if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) {              \
-        skc_block_id_t const b = bp_ids[bp_idx C];              \
-        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);   \
-      }
-
-      // printf("%2u: < %8X, %8X, %8X >\n",ii,bp_idx,b,elem C);
-      
-      SKC_PATHS_COPY_ELEM_EXPAND();
-
-      // store the elem back
-      (bp_elems+bp_elems_idx)[ii] = elem;
-    }
-}
-
-//
-//
-//
-
-void
-skc_host_map_update(__global skc_uint * const host_map,
-                    skc_uint            const block,
-                    skc_paths_copy_elem const elem)
-{
-  //
-  // write first elem to map -- FIXME -- this is a little nasty
-  // because it relies on the the host handle always being the first
-  // word in the path header.
-  //
-  // OTOH, this is not unreasonable.  The alternative is to have a
-  // separate kernel initializing the map.
-  //
-#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
-  if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE)
-#endif
-    {
-#if SKC_PATHS_COPY_ELEM_WORDS == 1
-      host_map[elem] = block; 
-#if 0
-      printf("[%u] = %u\n",elem,block);
-#endif
-#else
-      host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block;
-#endif
-    }
-}
-
-//
-//
-//
-
-void
-skc_copy_head(__global skc_uint                  * const host_map,
-              skc_uint                             const block,
-              __global skc_paths_copy_elem       * const bp_elems, // to
-              skc_uint                             const bp_elems_idx,
-              __global skc_block_id_t      const * const bp_ids,
-              skc_uint                             const bp_reads,
-              skc_uint                             const bp_idx_mask,
-              __global skc_paths_copy_elem const * const pb_elems, // from
-              skc_uint                             const pb_elems_idx,
-              skc_uint                             const pb_rolling)
-{
-  //
-  // if there are more path header words than there are
-  // threads-per-block then we can just copy the initial header words
-  //
-#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 )
-  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
-    {
-      skc_paths_copy_elem const elem = (pb_elems+pb_elems_idx)[ii];
-
-      (bp_elems+bp_elems_idx)[ii] = elem;
-
-      if (ii == 0) {
-        skc_host_map_update(host_map,block,elem);
-      }
-    }
-#endif
-
-  //
-  // this is similar to copy node but the first H words of the path
-  // header are not modified and simply copied
-  //
-  for (skc_uint ii=SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii<SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
-    {
-      skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];
-
-#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER == 0 )
-      if (ii == 0) {
-        skc_host_map_update(host_map,block,elem);
-      }
-#endif
-      // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
-      skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
-
-      //
-      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
-      // will _always_ be safe as long as we don't use the loaded
-      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
-      // of iterating over the vector components.
-      //
-
-      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
-
-      // FIXME -- MIX MIX MIX MIX / SELECT
-
-      // only convert if original elem is not invalid
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R)                                         \
-      if (SKC_IS_NOT_PATH_HEAD(ii,I) && (elem C != SKC_TAGGED_BLOCK_ID_INVALID)) { \
-        skc_block_id_t const b = bp_ids[bp_idx C];                      \
-        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);           \
-      }
-
-      // printf("%2u: ( %8X, %8X, %8X )\n",ii,bp_idx,b,elem C);
-
-      SKC_PATHS_COPY_ELEM_EXPAND();
-
-      // store the elem back
-      (bp_elems+bp_elems_idx)[ii] = elem;
-    }
-
-  //
-  // the remaining words are treated like a node
-  //
-  for (skc_uint ii=SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
-    {
-      // load block_id_tag words
-      skc_paths_copy_elem elem   = (pb_elems+pb_elems_idx)[ii];
-
-      // calculate ahead of time
-      skc_pb_idx_v  const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
-
-      //
-      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
-      // will _always_ be safe as long as we don't use the loaded
-      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
-      // of iterating over the vector components.
-      //
-
-      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
-
-      // only convert if original elem is not invalid
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R)                                 \
-      if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) {              \
-        skc_block_id_t const b = bp_ids[bp_idx C];              \
-        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);   \
-      }
-
-      // printf("%2u: [ %8X, %8X, %8X ]\n",ii,bp_idx,b,elem C);
-
-      SKC_PATHS_COPY_ELEM_EXPAND();
-
-      // store the elem
-      (bp_elems+bp_elems_idx)[ii] = elem;
-    }
-}
-
-//
-// FIXME -- pack some of these constant integer args in a vec or struct
-//
-
-__kernel
-SKC_PATHS_COPY_KERNEL_ATTRIBS
-void
-skc_kernel_paths_copy
-(__global skc_uint                        * const host_map,
-
- __global skc_block_id_t            const * const bp_ids,
- __global skc_paths_copy_elem             * const bp_elems,
- skc_uint                                   const bp_idx_mask, // pow2 modulo mask for block pool ring
-
- __global skc_uint                  const * const bp_alloc,    // block pool ring base
- skc_uint                                   const bp_alloc_idx,// which subbuf
-
- __global union skc_tagged_block_id const * const pb_cmds,
- __global skc_paths_copy_elem       const * const pb_elems,
-
- skc_uint                                   const pb_size,     // # of commands/blocks in buffer
- skc_uint                                   const pb_rolling,  // shifted rolling counter base
-
- skc_uint                                   const pb_prev_from,
- skc_uint                                   const pb_prev_span,
- skc_uint                                   const pb_curr_from)
-{
-  //
-  // THERE ARE 3 TYPES OF PATH COPYING COMMANDS:
-  //
-  // - HEAD
-  // - NODE
-  // - SEGS
-  //
-  // THESE ARE SUBGROUP ORIENTED KERNELS
-  //
-  // A SUBGROUP CAN OPERATE ON [1,N] BLOCKS
-  //
-
-  //
-  // It's likely that peak bandwidth is achievable with a single
-  // workgroup.
-  //
-  // So let's keep the grids modestly sized and for simplicity and
-  // portability, let's assume that a single workgroup can perform all
-  // steps in the copy.
-  //
-  // Launch as large of a workgroup as possiblex
-  //
-  // 1. ATOMICALLY ALLOCATE BLOCKS BP_ELEMS POOL
-  // 2. CONVERT COMMANDS IN PB_ELEMS BLOCK OFFSETS
-  // 3. FOR EACH COMMAND:
-  //      - HEAD: SAVED HEAD ID PB_ELEMS MAP. CONVERT AND COPY H INDICES.
-  //      - NODE: CONVERT AND COPY B INDICES
-  //      - SEGS: BULK COPY
-  //
-  // B : number of words in block -- always pow2
-  // W : intelligently/arbitrarily chosen factor of B -- always pow2
-  //
-
-  //
-  // There are several approaches to processing the commands:
-  //
-  // 1. B threads are responsible for one block. All threads broadcast
-  //    load a single command word. Workgroup size must be a facpb_elemsr of
-  //    B.
-  //
-  // 2. W threads process an entire block. W will typically be the
-  //    device's subgroup/warp/wave width. W threads broadcast load a
-  //    single command word.
-  //
-  // 3. W threads process W blocks. W threads load W command words and
-  //    process W blocks.
-  //
-  // Clearly (1) has low I/O intensity but will achieve high
-  // parallelism by activating the most possible threads. The downside
-  // of this kind of approach is that the kernel will occupy even a
-  // large GPU with low intensity work and reduce opportunities for
-  // concurrent kernel execution (of other kernels).
-  //
-  // See Vasily Volkov's CUDA presentation describing these tradeoffs.
-  //
-  // Note that there are many other approaches.  For example, similar
-  // pb_elems (1) but each thread loads a pow2 vector of block data.
-  //
-
-  // load the copied atomic read "base" from gmem
-  skc_uint const bp_reads = bp_alloc[bp_alloc_idx];
-  // will always be less than 2^32
-  skc_uint const gid      = get_global_id(0);
-  // every subgroup/simd that will work on the block loads the same command
-  skc_uint const sg_idx   = gid / SKC_PATHS_COPY_SUBGROUP_SIZE;
-  // path builder data can be spread across two spans
-  skc_uint       pb_idx   = sg_idx + ((sg_idx < pb_prev_span) ? pb_prev_from : pb_curr_from);
-
-  // no need pb_elems make this branchless
-  if (pb_idx >= pb_size)
-    pb_idx -= pb_size;
-
-  // broadcast load the command
-  union skc_tagged_block_id const pb_cmd       = pb_cmds[pb_idx];
-
-  // what do we want pb_elems do with this block?
-  skc_cmd_paths_copy_tag    const tag          = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32);
-
-  // compute offset from rolling base to get index into block pool ring allocation
-  skc_uint                  const bp_off       = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling);
-
-  // convert the pb_cmd's offset counter pb_elems a block id
-  skc_block_id_t            const block        = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off);
-
-#if 0
-  if (get_sub_group_local_id() == 0) {
-    printf("bp_off/reads = %u / %u\n",bp_off,bp_reads);
-    printf("< %8u >\n",block);
-  }
-#endif
-
-  // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id()
-  skc_uint                 const tid          = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK;
-
-  // calculate bp_elems (to) / pb_elems (from)
-  skc_uint                 const bp_elems_idx = block  * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid;
-  skc_uint                 const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK    + tid;
-
-  if      (tag == SKC_CMD_PATHS_COPY_TAG_SEGS)
-    {
-#if 0
-      if (tid == 0)
-        printf("%3u, segs\n",bp_off);
-#endif
-      skc_copy_segs(bp_elems,
-                    bp_elems_idx,
-                    pb_elems,
-                    pb_elems_idx);
-    }
-  else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE)
-    {
-#if 0
-      if (tid == 0)
-        printf("%3u, NODE\n",bp_off);
-#endif
-      skc_copy_node(bp_elems, // to
-                    bp_elems_idx,
-                    bp_ids,
-                    bp_reads,
-                    bp_idx_mask,
-                    pb_elems, // from
-                    pb_elems_idx,
-                    pb_rolling);
-    }
-  else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD)
-    {
-#if 0
-      if (tid == 0)
-        printf("%3u, HEAD\n",bp_off);
-#endif
-      skc_copy_head(host_map,
-                    block,
-                    bp_elems, // to
-                    bp_elems_idx,
-                    bp_ids,
-                    bp_reads,
-                    bp_idx_mask,
-                    pb_elems, // from
-                    pb_elems_idx,
-                    pb_rolling);
-    }
-}
-
-//
-//
-//
-
-__kernel
-SKC_PATHS_ALLOC_KERNEL_ATTRIBS
-void
-skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics,
-                       __global skc_uint          * const bp_alloc,
-                       skc_uint                     const bp_alloc_idx,
-                       skc_uint                     const pb_cmd_count)
-{
-  //
-  // allocate blocks in block pool
-  //
-  skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count);
-
-  // store in slot
-  bp_alloc[bp_alloc_idx] = reads;
-
-#if 0
-  printf("pc: %8u + %u\n",reads,pb_cmd_count);
-#endif
-}
-
-//
-//
-//
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "path.h"
+#include "block_pool_cl.h"
+#include "path_builder_cl_12.h"
+#include "kernel_cl_12.h"
+
+//
+//
+//
+
+#if 0
+
+//
+// SIMD AVX2
+//
+
+#define SKC_PATHS_COPY_WORDS_PER_ELEM          8
+#define SKC_PATHS_COPY_SUBGROUP_SIZE           1
+#define SKC_PATHS_COPY_KERNEL_ATTRIBUTES
+
+typedef skc_uint8  skc_paths_copy_elem;
+typedef skc_uint8  skc_pb_idx_v;
+
+#define SKC_PATHS_COPY_ELEM_EXPAND()           SKC_EXPAND_8()
+
+#define SKC_IS_NOT_PATH_HEAD(sg,I)             ((sg) + I >= SKC_PATH_HEAD_WORDS)
+
+#endif
+
+//
+//
+//
+
+#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK      (SKC_PATHS_COPY_SUBGROUP_SIZE - 1)
+#define SKC_PATHS_COPY_ELEMS_PER_BLOCK         (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
+#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK      (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
+#define SKC_PATHS_COPY_ELEMS_PER_THREAD        (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE)
+
+// FIXME -- use SUBGROUP terminology everywhere
+#define SKC_PATHS_COPY_SUBGROUP_WORDS          (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS)
+
+//
+//
+//
+
+#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER                              \
+  (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS))
+
+#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER                           \
+  (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS))
+
+// #define SKC_PATHS_COPY_HEAD_ELEMS    ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS)
+
+//
+//
+//
+
+//
+// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL
+//
+
+#define SKC_CMD_PATHS_COPY_ONE_BITS              (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2)
+
+#define SKC_CMD_PATHS_COPY_ONE_MASK              SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS)
+
+#define SKC_CMD_PATHS_COPY_ONE                   (1u << SKC_CMD_PATHS_COPY_ONE_BITS)
+
+#define SKC_CMD_PATHS_COPY_GET_TAG(ti)           SKC_TAGGED_BLOCK_ID_GET_TAG(ti)
+
+#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti)       ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS)
+
+#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b)  (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG))
+
+//
+//
+//
+
+skc_uint
+skc_sub_group_local_id()
+{
+#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
+  return get_sub_group_local_id();
+#else
+  return 0;
+#endif
+}
+
+//
+// convert an atomic read counter offset to a block id
+//
+
+skc_block_id_t
+skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids,
+                 skc_uint                        const bp_idx_mask,
+                 skc_uint                        const bp_reads,
+                 skc_uint                        const bp_off)
+{
+  skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask;
+
+  return bp_ids[bp_idx];
+}
+
+//
+//
+//
+
+void
+skc_copy_segs(__global skc_paths_copy_elem       * const bp_elems, // to
+              skc_uint                             const bp_elems_idx,
+              __global skc_paths_copy_elem const * const pb_elems, // from
+              skc_uint                             const pb_elems_idx)
+{
+  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
+    {
+      (bp_elems+bp_elems_idx)[ii] = (pb_elems+pb_elems_idx)[ii];
+    }
+
+#if 0
+  //
+  // NOTE THIS IS PRINTING 8 ROWS
+  //
+  printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
+         (skc_uint)get_global_id(0),pb_elems_idx,
+         as_float((pb_elems+pb_elems_idx)[0*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+         as_float((pb_elems+pb_elems_idx)[1*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+         as_float((pb_elems+pb_elems_idx)[2*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+         as_float((pb_elems+pb_elems_idx)[3*SKC_PATHS_COPY_SUBGROUP_SIZE]));
+  printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
+         (skc_uint)get_global_id(0),pb_elems_idx,
+         as_float((pb_elems+pb_elems_idx)[4*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+         as_float((pb_elems+pb_elems_idx)[5*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+         as_float((pb_elems+pb_elems_idx)[6*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+         as_float((pb_elems+pb_elems_idx)[7*SKC_PATHS_COPY_SUBGROUP_SIZE]));
+#endif
+}
+
+//
+//
+//
+
+void
+skc_copy_node(__global skc_paths_copy_elem       * const bp_elems, // to
+              skc_uint                             const bp_elems_idx,
+              __global skc_block_id_t      const * const bp_ids,
+              skc_uint                             const bp_reads,
+              skc_uint                             const bp_idx_mask,
+              __global skc_paths_copy_elem const * const pb_elems, // from
+              skc_uint                             const pb_elems_idx,
+              skc_uint                             const pb_rolling)
+{
+  //
+  // remap block id tags bp_elems the host-side rolling counter pb_elems a
+  // device-side block pool id
+  //
+  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
+    {
+      // load block_id_tag words
+      skc_paths_copy_elem elem   = (pb_elems + pb_elems_idx)[ii];
+
+      // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
+      skc_pb_idx_v  const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
+
+      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
+
+      //
+      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
+      // will _always_ be safe as long as we don't use the loaded
+      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
+      // of iterating over the vector components.
+      //
+
+      // only convert if original elem is not invalid
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                 \
+      if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) {              \
+        skc_block_id_t const b = bp_ids[bp_idx C];              \
+        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);   \
+      }
+
+      // printf("%2u: < %8X, %8X, %8X >\n",ii,bp_idx,b,elem C);
+      
+      SKC_PATHS_COPY_ELEM_EXPAND();
+
+      // store the elem back
+      (bp_elems+bp_elems_idx)[ii] = elem;
+    }
+}
+
+//
+//
+//
+
+void
+skc_host_map_update(__global skc_uint * const host_map,
+                    skc_uint            const block,
+                    skc_paths_copy_elem const elem)
+{
+  //
+  // write first elem to map -- FIXME -- this is a little nasty
+  // because it relies on the the host handle always being the first
+  // word in the path header.
+  //
+  // OTOH, this is not unreasonable.  The alternative is to have a
+  // separate kernel initializing the map.
+  //
+#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
+  if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE)
+#endif
+    {
+#if SKC_PATHS_COPY_ELEM_WORDS == 1
+      host_map[elem] = block; 
+#if 0
+      printf("[%u] = %u\n",elem,block);
+#endif
+#else
+      host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block;
+#endif
+    }
+}
+
+//
+//
+//
+
+void
+skc_copy_head(__global skc_uint                  * const host_map,
+              skc_uint                             const block,
+              __global skc_paths_copy_elem       * const bp_elems, // to
+              skc_uint                             const bp_elems_idx,
+              __global skc_block_id_t      const * const bp_ids,
+              skc_uint                             const bp_reads,
+              skc_uint                             const bp_idx_mask,
+              __global skc_paths_copy_elem const * const pb_elems, // from
+              skc_uint                             const pb_elems_idx,
+              skc_uint                             const pb_rolling)
+{
+  //
+  // if there are more path header words than there are
+  // threads-per-block then we can just copy the initial header words
+  //
+#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 )
+  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
+    {
+      skc_paths_copy_elem const elem = (pb_elems+pb_elems_idx)[ii];
+
+      (bp_elems+bp_elems_idx)[ii] = elem;
+
+      if (ii == 0) {
+        skc_host_map_update(host_map,block,elem);
+      }
+    }
+#endif
+
+  //
+  // this is similar to copy node but the first H words of the path
+  // header are not modified and simply copied
+  //
+  for (skc_uint ii=SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii<SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
+    {
+      skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];
+
+#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER == 0 )
+      if (ii == 0) {
+        skc_host_map_update(host_map,block,elem);
+      }
+#endif
+      // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
+      skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
+
+      //
+      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
+      // will _always_ be safe as long as we don't use the loaded
+      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
+      // of iterating over the vector components.
+      //
+
+      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
+
+      // FIXME -- MIX MIX MIX MIX / SELECT
+
+      // only convert if original elem is not invalid
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+      if (SKC_IS_NOT_PATH_HEAD(ii,I) && (elem C != SKC_TAGGED_BLOCK_ID_INVALID)) { \
+        skc_block_id_t const b = bp_ids[bp_idx C];                      \
+        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);           \
+      }
+
+      // printf("%2u: ( %8X, %8X, %8X )\n",ii,bp_idx,b,elem C);
+
+      SKC_PATHS_COPY_ELEM_EXPAND();
+
+      // store the elem back
+      (bp_elems+bp_elems_idx)[ii] = elem;
+    }
+
+  //
+  // the remaining words are treated like a node
+  //
+  for (skc_uint ii=SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
+    {
+      // load block_id_tag words
+      skc_paths_copy_elem elem   = (pb_elems+pb_elems_idx)[ii];
+
+      // calculate ahead of time
+      skc_pb_idx_v  const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
+
+      //
+      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
+      // will _always_ be safe as long as we don't use the loaded
+      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
+      // of iterating over the vector components.
+      //
+
+      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
+
+      // only convert if original elem is not invalid
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                 \
+      if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) {              \
+        skc_block_id_t const b = bp_ids[bp_idx C];              \
+        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);   \
+      }
+
+      // printf("%2u: [ %8X, %8X, %8X ]\n",ii,bp_idx,b,elem C);
+
+      SKC_PATHS_COPY_ELEM_EXPAND();
+
+      // store the elem
+      (bp_elems+bp_elems_idx)[ii] = elem;
+    }
+}
+
+//
+// FIXME -- pack some of these constant integer args in a vec or struct
+//
+
+__kernel
+SKC_PATHS_COPY_KERNEL_ATTRIBS
+void
+skc_kernel_paths_copy
+(__global skc_uint                        * const host_map,
+
+ __global skc_block_id_t            const * const bp_ids,
+ __global skc_paths_copy_elem             * const bp_elems,
+ skc_uint                                   const bp_idx_mask, // pow2 modulo mask for block pool ring
+
+ __global skc_uint                  const * const bp_alloc,    // block pool ring base
+ skc_uint                                   const bp_alloc_idx,// which subbuf
+
+ __global union skc_tagged_block_id const * const pb_cmds,
+ __global skc_paths_copy_elem       const * const pb_elems,
+
+ skc_uint                                   const pb_size,     // # of commands/blocks in buffer
+ skc_uint                                   const pb_rolling,  // shifted rolling counter base
+
+ skc_uint                                   const pb_prev_from,
+ skc_uint                                   const pb_prev_span,
+ skc_uint                                   const pb_curr_from)
+{
+  //
+  // THERE ARE 3 TYPES OF PATH COPYING COMMANDS:
+  //
+  // - HEAD
+  // - NODE
+  // - SEGS
+  //
+  // THESE ARE SUBGROUP ORIENTED KERNELS
+  //
+  // A SUBGROUP CAN OPERATE ON [1,N] BLOCKS
+  //
+
+  //
+  // It's likely that peak bandwidth is achievable with a single
+  // workgroup.
+  //
+  // So let's keep the grids modestly sized and for simplicity and
+  // portability, let's assume that a single workgroup can perform all
+  // steps in the copy.
+  //
+  // Launch as large of a workgroup as possiblex
+  //
+  // 1. ATOMICALLY ALLOCATE BLOCKS BP_ELEMS POOL
+  // 2. CONVERT COMMANDS IN PB_ELEMS BLOCK OFFSETS
+  // 3. FOR EACH COMMAND:
+  //      - HEAD: SAVED HEAD ID PB_ELEMS MAP. CONVERT AND COPY H INDICES.
+  //      - NODE: CONVERT AND COPY B INDICES
+  //      - SEGS: BULK COPY
+  //
+  // B : number of words in block -- always pow2
+  // W : intelligently/arbitrarily chosen factor of B -- always pow2
+  //
+
+  //
+  // There are several approaches to processing the commands:
+  //
+  // 1. B threads are responsible for one block. All threads broadcast
+  //    load a single command word. Workgroup size must be a facpb_elemsr of
+  //    B.
+  //
+  // 2. W threads process an entire block. W will typically be the
+  //    device's subgroup/warp/wave width. W threads broadcast load a
+  //    single command word.
+  //
+  // 3. W threads process W blocks. W threads load W command words and
+  //    process W blocks.
+  //
+  // Clearly (1) has low I/O intensity but will achieve high
+  // parallelism by activating the most possible threads. The downside
+  // of this kind of approach is that the kernel will occupy even a
+  // large GPU with low intensity work and reduce opportunities for
+  // concurrent kernel execution (of other kernels).
+  //
+  // See Vasily Volkov's CUDA presentation describing these tradeoffs.
+  //
+  // Note that there are many other approaches.  For example, similar
+  // pb_elems (1) but each thread loads a pow2 vector of block data.
+  //
+
+  // load the copied atomic read "base" from gmem
+  skc_uint const bp_reads = bp_alloc[bp_alloc_idx];
+  // will always be less than 2^32
+  skc_uint const gid      = get_global_id(0);
+  // every subgroup/simd that will work on the block loads the same command
+  skc_uint const sg_idx   = gid / SKC_PATHS_COPY_SUBGROUP_SIZE;
+  // path builder data can be spread across two spans
+  skc_uint       pb_idx   = sg_idx + ((sg_idx < pb_prev_span) ? pb_prev_from : pb_curr_from);
+
+  // no need pb_elems make this branchless
+  if (pb_idx >= pb_size)
+    pb_idx -= pb_size;
+
+  // broadcast load the command
+  union skc_tagged_block_id const pb_cmd       = pb_cmds[pb_idx];
+
+  // what do we want pb_elems do with this block?
+  skc_cmd_paths_copy_tag    const tag          = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32);
+
+  // compute offset from rolling base to get index into block pool ring allocation
+  skc_uint                  const bp_off       = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling);
+
+  // convert the pb_cmd's offset counter pb_elems a block id
+  skc_block_id_t            const block        = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off);
+
+#if 0
+  if (get_sub_group_local_id() == 0) {
+    printf("bp_off/reads = %u / %u\n",bp_off,bp_reads);
+    printf("< %8u >\n",block);
+  }
+#endif
+
+  // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id()
+  skc_uint                 const tid          = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK;
+
+  // calculate bp_elems (to) / pb_elems (from)
+  skc_uint                 const bp_elems_idx = block  * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid;
+  skc_uint                 const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK    + tid;
+
+  if      (tag == SKC_CMD_PATHS_COPY_TAG_SEGS)
+    {
+#if 0
+      if (tid == 0)
+        printf("%3u, segs\n",bp_off);
+#endif
+      skc_copy_segs(bp_elems,
+                    bp_elems_idx,
+                    pb_elems,
+                    pb_elems_idx);
+    }
+  else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE)
+    {
+#if 0
+      if (tid == 0)
+        printf("%3u, NODE\n",bp_off);
+#endif
+      skc_copy_node(bp_elems, // to
+                    bp_elems_idx,
+                    bp_ids,
+                    bp_reads,
+                    bp_idx_mask,
+                    pb_elems, // from
+                    pb_elems_idx,
+                    pb_rolling);
+    }
+  else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD)
+    {
+#if 0
+      if (tid == 0)
+        printf("%3u, HEAD\n",bp_off);
+#endif
+      skc_copy_head(host_map,
+                    block,
+                    bp_elems, // to
+                    bp_elems_idx,
+                    bp_ids,
+                    bp_reads,
+                    bp_idx_mask,
+                    pb_elems, // from
+                    pb_elems_idx,
+                    pb_rolling);
+    }
+}
+
+//
+//
+//
+
+__kernel
+SKC_PATHS_ALLOC_KERNEL_ATTRIBS
+void
+skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics,
+                       __global skc_uint          * const bp_alloc,
+                       skc_uint                     const bp_alloc_idx,
+                       skc_uint                     const pb_cmd_count)
+{
+  //
+  // allocate blocks in block pool
+  //
+  skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count);
+
+  // store in slot
+  bp_alloc[bp_alloc_idx] = reads;
+
+#if 0
+  printf("pc: %8u + %u\n",reads,pb_cmd_count);
+#endif
+}
+
+//
+//
+//