Diffstat (limited to 'src/compute/skc/platforms/cl_12/path_builder_cl_12.c')
-rw-r--r--  src/compute/skc/platforms/cl_12/path_builder_cl_12.c  1443
1 file changed, 1443 insertions, 0 deletions
diff --git a/src/compute/skc/platforms/cl_12/path_builder_cl_12.c b/src/compute/skc/platforms/cl_12/path_builder_cl_12.c
new file mode 100644
index 0000000000..e915dffada
--- /dev/null
+++ b/src/compute/skc/platforms/cl_12/path_builder_cl_12.c
@@ -0,0 +1,1443 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <float.h>
+#include <stdio.h>
+
+#include "common/cl/assert_cl.h"
+
+#include "context.h"
+#include "handle.h"
+#include "grid.h"
+#include "path.h"
+#include "path_builder.h"
+
+#include "config_cl.h"
+#include "export_cl_12.h"
+#include "runtime_cl_12.h"
+#include "path_builder_cl_12.h"
+
+//
+// OpenCL 1.2 devices support mapping of buffers into the host address
+// space.
+//
+// Mapped buffers must be aligned on a MIN_DATA_TYPE_ALIGN_SIZE byte
+// boundary (e.g. 128 bytes). This complicates coordinating sharing
+// of data between the host and the device.
+//
+// Some OpenCL 2.0 devices support fine-grained shared virtual memory
+// pointers with byte-addressing and allow simpler coordination
+// strategies at the cost of maintaining cache coherency.
+//
+// The path builder is focused on moving bulk path data from the host
+// into the device-managed "block" memory pool and arranging it into a
+// SIMT/SIMD-friendly data structure that can be efficiently read by
+// the rasterizer.
+//
+// Note that one simplifying assumption is that the maximum length of
+// a *single* path can't be larger than what fits in the single extent
+// (which is split into M subbuffers). This would be a very long path
+// and a legitimate size limitation.
+//
+// For some systems, it may be appropriate to never pull path data
+// into the device-managed block pool and instead present the path
+// data to the device in a temporarily available allocated memory
+// "zone" of paths that can be discarded all at once.
+//
+// For other systems, it may be appropriate to simply copy the path
+// data from host to device.
+//
+// But the majority of OpenCL (and VK, MTL, DX12) devices we'll be
+// targeting support basic map/unmap functionality similar to OpenCL
+// 1.2. Furthermore, not all OpenCL 2.0 devices support fine-grained
+// sharing of memory and still require a map/unmap step... but note
+// that they all support byte-aligned mapping and subbuffers.
+//
+// The general strategy that this particular CL_12 implementation uses
+// is to allocate a large mappable bulk-data path buffer and an
+// auxiliary mappable command buffer.
+//
+// The buffers are split into a reasonable number of properly aligned
+// subbuffers to enable simultaneous host and device access.
+//
+
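+//
+// Illustrative sizing (hypothetical numbers -- the real sizes come
+// from the runtime config): a 16 MB block buffer split into M=4
+// aligned 4 MB subbuffers lets the host fill subbuffer k+1 while the
+// device is still copying out of subbuffer k.
+//
+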
+//
+// Blocks:
+// 1 extent
+// M mapped subbuffers (configurable) to allow for concurrency
+//
+// Commands:
+// 1 extent
+// M mapped subbuffers (configurable) to allow for concurrency
+//
+// Spans:
+// M hi/lo structures
+//
+// { cl_sub, void*, event, base }
+//
+// - size of sub buffer
+// - remaining
+//
+// - counts
+//
+
+//
+// For any kernel launch, at most one path will be discontiguous and
+// defined across two sub-buffers.
+//
+// Nodes are updated locally until full and then stored so they will
+// never be incomplete. Headers are stored locally until the path is
+// ended so they will never be incomplete.
+//
+// A line, quad or cubic acquires 4/6/8 segments which may be spread
+// across one or more contiguous blocks.
+//
+// If a flush() occurs then the remaining columns of multi-segment
+// paths are initialized with zero-length line, quad, cubic elements.
+//
+// Every block's command word has a type and a count acquired from a
+// rolling counter.
+//
+// The kernel is passed two spans of blocks { base, count } to
+// process. The grid must process (lo.count + hi.count) blocks.
+//
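+// Example (hypothetical counts): if the previous flush left a
+// partial span of 2 blocks and the current flush spans 24 blocks,
+// the copy kernel is launched to process (2 + 24) = 26 commands and
+// their associated blocks.
+//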
+
+struct skc_subbuffer_blocks
+{
+ cl_mem device;
+ void * host;
+};
+
+struct skc_subbuffer_cmds
+{
+ cl_mem device;
+ void * host;
+ cl_event map;
+};
+
+//
+// ringdex is an index with range [0, blocks-per-subbuf * subbufs-per-buffer )
+//
+
+typedef skc_uint skc_ringdex_t;
+
+union skc_ringdex_expand
+{
+ div_t qr;
+
+ struct {
+#ifndef SKC_DIV_REM_BEFORE_QUOT // offsetof(div_t,quot) != 0
+ skc_uint subbuf;
+ skc_uint block;
+#else
+ skc_uint block;
+ skc_uint subbuf;
+#endif
+ };
+};
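+
+//
+// Worked example (hypothetical config): with blocks_per.subbuf == 8,
+// a ringdex of 21 expands via div(21,8) to { .subbuf = 2, .block = 5 },
+// i.e. the 6th block of the 3rd subbuffer.
+//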
+
+//
+// this record is executed by the grid
+//
+
+struct skc_release_record
+{
+ struct skc_path_builder_impl * impl; // back pointer to impl
+
+ skc_grid_t grid; // pointer to scheduled grid
+
+ skc_uint from; // inclusive starting index : [from,to)
+ skc_uint to; // non-inclusive ending index : [from,to)
+};
+
+//
+//
+//
+
+struct skc_path_builder_impl
+{
+ struct skc_path_builder * path_builder;
+
+ struct skc_runtime * runtime;
+
+ cl_command_queue cq;
+
+ struct {
+ cl_kernel alloc;
+ cl_kernel copy;
+ } kernels;
+
+ //
+ // FIXME -- make this pointer to constant config
+ //
+ // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
+ struct {
+ skc_uint subbufs; // how many subbufs in the buffer?
+
+ struct {
+ skc_uint buffer; // how many blocks in the buffer?
+ skc_uint subbuf; // how many blocks in a subbuf?
+ } blocks_per;
+ } ring;
+ //
+ // ^^^^^^^^^^^ don't duplicate these constants ^^^^^^^^^^^^^^^^^^
+ //
+
+ struct {
+ cl_mem buffer; // backing buffer for blocks
+ struct skc_subbuffer_blocks * subbufs; // array of structures
+ } blocks;
+
+ struct {
+ cl_mem buffer; // backing buffer for commands
+ struct skc_subbuffer_cmds * subbufs; // array of structures
+ } cmds;
+
+ struct {
+ struct skc_release_record * records; // max release records is equal to max subbufs
+ skc_path_t * paths; // max paths is less than or equal to max commands
+ } release;
+
+ cl_mem reads; // each kernel only requires one word to store the block pool "base"
+
+ struct {
+ skc_uint rolling; // rolling counter used by cmds to map to block pool alloc
+ skc_ringdex_t from;
+ skc_ringdex_t to;
+ } prev;
+
+ struct {
+ skc_ringdex_t from;
+ skc_ringdex_t to;
+ } curr;
+
+ struct {
+ struct skc_path_head * head; // pointer to local path header -- not written until path end
+ struct skc_path_node * node; // pointer to local node -- may alias head until head is full
+
+ struct {
+ skc_uint rolling; // rolling counter of wip node -- valid after one node is allocated
+ union skc_tagged_block_id * next; // next slot in node -- may initially point to head.ids
+ skc_uint rem; // how many id slots left in node block
+ } ids;
+
+ struct {
+ skc_uint rem; // how many subblocks left in block?
+ skc_uint rolling; // rolling counter of block of subblocks
+ float * next; // next subblock in current subblock block
+ skc_uint idx; // index of next subblock
+ } subblocks;
+
+ struct {
+ skc_uint one; // .block = 1
+ skc_uint next; // rolling counter used by cmds to map to block pool alloc
+ } rolling;
+
+ skc_ringdex_t to; // ringdex of _next_available_ command/block in ring -- FIXME -- should be current
+ } wip;
+};
+
+//
+// FIXME -- move to a pow2 subbuffer size and dispense with division
+// and modulo operations
+//
+
+static
+union skc_ringdex_expand
+skc_ringdex_expand(struct skc_path_builder_impl * const impl,
+ skc_ringdex_t const ringdex)
+{
+ return (union skc_ringdex_expand){
+ .qr = div(ringdex,impl->ring.blocks_per.subbuf)
+ };
+}
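+
+//
+// A minimal sketch of the pow2 alternative named in the FIXME above,
+// assuming blocks_per.subbuf were constrained to a power of two and a
+// hypothetical precomputed ring.blocks_per.subbuf_log2 were added --
+// the div/mod pair collapses to a shift and a mask:
+//
+//   return (union skc_ringdex_expand){
+//     .subbuf = ringdex >> impl->ring.blocks_per.subbuf_log2,
+//     .block  = ringdex &  (impl->ring.blocks_per.subbuf - 1)
+//   };
+//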
+
+static
+void
+skc_ringdex_wip_to_block_inc(struct skc_path_builder_impl * const impl)
+{
+ //
+ // FIXME - which is faster?
+ //
+#if 1
+ impl->wip.to = (impl->wip.to + 1) % impl->ring.blocks_per.buffer;
+#else
+ impl->wip.to -= (impl->wip.to < impl->ring.blocks_per.buffer - 1) ? -1 : impl->wip.to;
+#endif
+
+ // this path is too long -- for now assert() and die
+ assert(impl->wip.to != impl->curr.from);
+}
+
+static
+skc_ringdex_t
+skc_ringdex_span(struct skc_path_builder_impl * const impl,
+ skc_ringdex_t const from,
+ skc_ringdex_t const to)
+{
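+ // note: unsigned arithmetic -- if 'to' has wrapped past 'from' the
+ // subtraction still yields the correct modular span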
+ return (to - from) % impl->ring.blocks_per.buffer;
+}
+
+static
+void
+skc_ringdex_wip_to_subbuf_inc(struct skc_path_builder_impl * const impl)
+{
+ union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to);
+
+ // nothing to do if this is the first block in the subbuf
+ if (to.block == 0)
+ return;
+
+ skc_uint const new_subbuf = (to.subbuf + 1) % impl->ring.subbufs;
+
+ // otherwise increment and mod
+ impl->wip.to = new_subbuf * impl->ring.blocks_per.subbuf;
+}
+
+static
+skc_bool
+skc_ringdex_curr_is_equal(struct skc_path_builder_impl * const impl)
+{
+ return impl->curr.from == impl->curr.to;
+}
+
+static
+skc_bool
+skc_ringdex_prev_is_equal(struct skc_path_builder_impl * const impl)
+{
+ return impl->prev.from == impl->prev.to;
+}
+
+static
+skc_uint
+skc_ringdex_dont_map_last(struct skc_path_builder_impl * const impl,
+ skc_uint const to_block)
+{
+ // no blocks acquired OR this is last block in subbuf
+ return !((impl->wip.to == impl->curr.to) || (to_block == 0));
+}
+
+//
+//
+//
+
+static
+struct skc_release_record *
+skc_release_curr(struct skc_path_builder_impl * const impl)
+{
+ union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from);
+
+ return impl->release.records + curr_from.subbuf;
+}
+
+//
+// FIXME -- get rid of all distant config references -- grab them all at creation time
+//
+
+static
+void
+skc_path_builder_pfn_begin(struct skc_path_builder_impl * const impl)
+{
+ // init header counters // { handle, blocks, nodes, prims }
+ impl->wip.head->header = (union skc_path_header){
+ .handle = 0,
+ .blocks = 0,
+ .nodes = 0,
+ .prims = 0
+ };
+
+ // FIXME -- BOUNDS SHOULD USE SIMD4 TRICK AND NEGATE ONE OF THE CORNERS
+ impl->wip.head->bounds = (union skc_path_bounds){ +FLT_MIN, +FLT_MIN, -FLT_MIN, -FLT_MIN };
+
+ // point wip ids at local head node
+ impl->wip.ids.next = impl->wip.head->tag_ids; // point to local head node
+ impl->wip.ids.rem = impl->runtime->config->block.words - SKC_PATH_HEAD_WORDS; // FIXME -- save this constant somewhere
+
+ // start with no subblocks
+ impl->wip.subblocks.rem = 0;
+}
+
+//
+//
+//
+
+static
+void
+skc_path_builder_impl_finalize_node(struct skc_path_builder_impl * const impl)
+{
+#if 1
+ //
+ // FIXME -- a Duff's device might be optimal here but would have to
+ // be customized per device since nodes could be 16-128+ words
+ //
+ while (impl->wip.ids.rem > 0)
+ {
+ impl->wip.ids.rem -= 1;
+ impl->wip.ids.next->u32 = SKC_TAGGED_BLOCK_ID_INVALID;
+ impl->wip.ids.next += 1;
+ }
+#else
+ memset(&impl->wip.ids.next->u32,
+ SKC_TAGGED_BLOCK_ID_INVALID, // 0xFF
+ sizeof(impl->wip.ids.next->u32) * impl->wip.ids.rem);
+
+ impl->wip.ids.next += impl->wip.ids.rem;
+ impl->wip.ids.rem = 0;
+#endif
+}
+
+//
+//
+//
+
+static
+void
+skc_zero_float(skc_float * p, skc_uint rem)
+{
+ memset(p,0,sizeof(*p)*rem);
+}
+
+static
+void
+skc_path_builder_finalize_subblocks(struct skc_path_builder * const path_builder)
+{
+ //
+ // FIXME -- it might be more performant to zero the remaining
+ // columns in a subblock -- a subblock at a time -- instead of the
+ // same column across all the subblocks
+ //
+#if 0
+ while (path_builder->line.rem > 0)
+ {
+ --path_builder->line.rem;
+
+ *path_builder->line.coords[0]++ = 0.0f;
+ *path_builder->line.coords[1]++ = 0.0f;
+ *path_builder->line.coords[2]++ = 0.0f;
+ *path_builder->line.coords[3]++ = 0.0f;
+ }
+
+ while (path_builder->quad.rem > 0)
+ {
+ --path_builder->quad.rem;
+
+ *path_builder->quad.coords[0]++ = 0.0f;
+ *path_builder->quad.coords[1]++ = 0.0f;
+ *path_builder->quad.coords[2]++ = 0.0f;
+ *path_builder->quad.coords[3]++ = 0.0f;
+ *path_builder->quad.coords[4]++ = 0.0f;
+ *path_builder->quad.coords[5]++ = 0.0f;
+ }
+
+ while (path_builder->cubic.rem > 0)
+ {
+ --path_builder->cubic.rem;
+
+ *path_builder->cubic.coords[0]++ = 0.0f;
+ *path_builder->cubic.coords[1]++ = 0.0f;
+ *path_builder->cubic.coords[2]++ = 0.0f;
+ *path_builder->cubic.coords[3]++ = 0.0f;
+ *path_builder->cubic.coords[4]++ = 0.0f;
+ *path_builder->cubic.coords[5]++ = 0.0f;
+ *path_builder->cubic.coords[6]++ = 0.0f;
+ *path_builder->cubic.coords[7]++ = 0.0f;
+ }
+#else
+ if (path_builder->line.rem > 0)
+ {
+ skc_zero_float(path_builder->line.coords[0],path_builder->line.rem);
+ skc_zero_float(path_builder->line.coords[1],path_builder->line.rem);
+ skc_zero_float(path_builder->line.coords[2],path_builder->line.rem);
+ skc_zero_float(path_builder->line.coords[3],path_builder->line.rem);
+
+ path_builder->line.rem = 0;
+ }
+
+ if (path_builder->quad.rem > 0)
+ {
+ skc_zero_float(path_builder->quad.coords[0],path_builder->quad.rem);
+ skc_zero_float(path_builder->quad.coords[1],path_builder->quad.rem);
+ skc_zero_float(path_builder->quad.coords[2],path_builder->quad.rem);
+ skc_zero_float(path_builder->quad.coords[3],path_builder->quad.rem);
+ skc_zero_float(path_builder->quad.coords[4],path_builder->quad.rem);
+ skc_zero_float(path_builder->quad.coords[5],path_builder->quad.rem);
+
+ path_builder->quad.rem = 0;
+ }
+
+ if (path_builder->cubic.rem > 0)
+ {
+ skc_zero_float(path_builder->cubic.coords[0],path_builder->cubic.rem);
+ skc_zero_float(path_builder->cubic.coords[1],path_builder->cubic.rem);
+ skc_zero_float(path_builder->cubic.coords[2],path_builder->cubic.rem);
+ skc_zero_float(path_builder->cubic.coords[3],path_builder->cubic.rem);
+ skc_zero_float(path_builder->cubic.coords[4],path_builder->cubic.rem);
+ skc_zero_float(path_builder->cubic.coords[5],path_builder->cubic.rem);
+ skc_zero_float(path_builder->cubic.coords[6],path_builder->cubic.rem);
+ skc_zero_float(path_builder->cubic.coords[7],path_builder->cubic.rem);
+
+ path_builder->cubic.rem = 0;
+ }
+#endif
+}
+
+//
+//
+//
+
+static
+void
+skc_path_builder_impl_unmap(struct skc_path_builder_impl * const impl,
+ skc_uint from,
+ skc_uint to)
+{
+ // to might be out of range
+ to = to % impl->ring.subbufs;
+
+#if 0
+ fprintf(stderr,"unmap: [%2u,%2u)\n",from,to);
+#endif
+
+ while (from != to)
+ {
+ // bring 'from' back in range
+ from = from % impl->ring.subbufs;
+
+ struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from;
+ struct skc_subbuffer_cmds * const cmds = impl->cmds .subbufs + from;
+
+ cl(EnqueueUnmapMemObject(impl->cq,
+ blocks->device,
+ blocks->host,
+ 0,NULL,NULL));
+
+ cl(EnqueueUnmapMemObject(impl->cq,
+ cmds->device,
+ cmds->host,
+ 0,NULL,NULL));
+
+ // increment and bring 'from' back in range
+ from = (from + 1) % impl->ring.subbufs;
+ }
+}
+
+//
+// FIXME -- reuse this in create()
+//
+
+static
+void
+skc_path_builder_impl_map(struct skc_path_builder_impl * const impl,
+ skc_uint from,
+ skc_uint to)
+{
+ // to might be out of range
+ to = to % impl->ring.subbufs;
+
+#if 0
+ fprintf(stderr," map: [%2u,%2u)\n",from,to);
+#endif
+
+ while (from != to)
+ {
+ cl_int cl_err;
+
+ struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from;
+ struct skc_subbuffer_cmds * const cmds = impl->cmds .subbufs + from;
+
+ blocks->host = clEnqueueMapBuffer(impl->cq,
+ blocks->device,
+ CL_FALSE,
+ CL_MAP_WRITE_INVALIDATE_REGION,
+ 0,impl->runtime->config->paths_copy.block.subbuf,
+ 0,NULL,NULL,
+ &cl_err); cl_ok(cl_err);
+
+ cl(ReleaseEvent(cmds->map));
+
+ cmds->host = clEnqueueMapBuffer(impl->cq,
+ cmds->device,
+ CL_FALSE,
+ CL_MAP_WRITE_INVALIDATE_REGION,
+ 0,impl->runtime->config->paths_copy.command.subbuf,
+ 0,NULL,&cmds->map,
+ &cl_err); cl_ok(cl_err);
+
+ // increment and bring 'from' back in range
+ from = (from + 1) % impl->ring.subbufs;
+ }
+ //
+ // FIXME -- when we switch to out of order queues we'll need a barrier here
+ //
+}
+
+//
+//
+//
+
+static
+void
+skc_path_builder_release_dispose(struct skc_release_record * const release,
+ struct skc_path_builder_impl * const impl)
+{
+ struct skc_runtime * runtime = impl->runtime;
+
+ if (release->from <= release->to) // no wrap
+ {
+ skc_path_t const * paths = impl->release.paths + release->from;
+ skc_uint count = release->to - release->from;
+
+ skc_grid_deps_unmap(runtime->deps,paths,count);
+ skc_runtime_path_device_release(runtime,paths,count);
+ }
+ else // from > to implies wrap
+ {
+ skc_path_t const * paths_lo = impl->release.paths + release->from;
+ skc_uint count_lo = impl->ring.blocks_per.buffer - release->from;
+
+ skc_grid_deps_unmap(runtime->deps,paths_lo,count_lo);
+ skc_runtime_path_device_release(runtime,paths_lo,count_lo);
+
+ skc_grid_deps_unmap(runtime->deps,impl->release.paths,release->to);
+ skc_runtime_path_device_release(runtime,impl->release.paths,release->to);
+ }
+
+ release->to = release->from;
+}
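+
+//
+// Example of the wrap case above (hypothetical indices): with
+// blocks_per.buffer == 32, from == 30 and to == 3, the paths in
+// [30,32) are released first, followed by the paths in [0,3).
+//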
+
+static
+void
+skc_path_builder_grid_pfn_dispose(skc_grid_t const grid)
+{
+ struct skc_release_record * const release = skc_grid_get_data(grid);
+ struct skc_path_builder_impl * const impl = release->impl;
+
+ skc_path_builder_release_dispose(release,impl);
+}
+
+static
+void
+// skc_path_builder_complete(struct skc_release_record * const release)
+skc_path_builder_complete(skc_grid_t grid)
+{
+ //
+ // notify deps that this grid is complete enough for other grids to
+ // proceed
+ //
+ // the path builder still has some cleanup to do before all its
+ // resources can be reused
+ //
+ skc_grid_complete(grid);
+}
+
+static
+void
+skc_path_builder_paths_copy_cb(cl_event event, cl_int status, skc_grid_t grid)
+{
+ SKC_CL_CB(status);
+
+ struct skc_release_record * const release = skc_grid_get_data(grid);
+
+ SKC_SCHEDULER_SCHEDULE(release->impl->runtime->scheduler,skc_path_builder_complete,grid);
+}
+
+//
+//
+//
+
+static
+void
+skc_path_builder_grid_pfn_waiting(skc_grid_t const grid)
+{
+ struct skc_release_record * const release = skc_grid_get_data(grid);
+ struct skc_path_builder_impl * const impl = release->impl;
+
+ // 1. flush incomplete subblocks of path elements
+ // 2. unmap subbuffer on cq.unmap
+ // 3. flush cq.unmap
+ // 4. launch kernel on cq.kernel but wait for unmap completion
+ // 5. flush cq.kernel
+ // 6. remap relevant subbuffers on cq.map but wait for kernel completion
+ // 7. flush cq.map
+
+ //
+ // FIXME -- can be smarter about flushing if the wip paths are not
+ // in the same subbuf as curr.to
+ //
+ // THIS IS IMPORTANT TO FIX
+ //
+
+ // flush incomplete subblocks
+ skc_path_builder_finalize_subblocks(impl->path_builder);
+
+ //
+ // get range of subbufs that need to be unmapped
+ //
+ // note that impl->prev subbufs have already been unmapped
+ //
+ union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from);
+ union skc_ringdex_expand curr_to = skc_ringdex_expand(impl,impl->curr.to);
+ skc_uint const is_partial = curr_to.block > 0;
+ skc_uint const unmap_to = curr_to.subbuf + is_partial;
+
+ //
+ // unmap all subbufs in range [from,to)
+ //
+ skc_path_builder_impl_unmap(impl,curr_from.subbuf,unmap_to);
+
+ //
+ // launch kernels
+ //
+ skc_uint const pb_prev_span = skc_ringdex_span(impl,impl->prev.from,impl->prev.to);
+ skc_uint const pb_curr_span = skc_ringdex_span(impl,impl->curr.from,impl->curr.to);
+ skc_uint const pb_cmds = pb_prev_span + pb_curr_span;
+
+ //
+ // 1) allocate blocks from pool
+ //
+
+ //
+ // FIXME -- pack integers into struct/vector
+ //
+ cl(SetKernelArg(impl->kernels.alloc,0,SKC_CL_ARG(impl->runtime->block_pool.atomics.drw)));
+ cl(SetKernelArg(impl->kernels.alloc,1,SKC_CL_ARG(impl->reads)));
+ cl(SetKernelArg(impl->kernels.alloc,2,SKC_CL_ARG(curr_from.subbuf)));
+ cl(SetKernelArg(impl->kernels.alloc,3,SKC_CL_ARG(pb_cmds)));
+
+ skc_device_enqueue_kernel(impl->runtime->device,
+ SKC_DEVICE_KERNEL_ID_PATHS_ALLOC,
+ impl->cq,
+ impl->kernels.alloc,
+ 1,
+ 0,NULL,NULL);
+
+ //
+ // 2) copy blocks from unmapped device-accessible memory
+ //
+
+ //
+ // FIXME -- pack integers into struct/vector and reduce 13 arguments down to 7
+ //
+ cl(SetKernelArg(impl->kernels.copy, 0,SKC_CL_ARG(impl->runtime->handle_pool.map.drw)));
+
+ cl(SetKernelArg(impl->kernels.copy, 1,SKC_CL_ARG(impl->runtime->block_pool.ids.drw)));
+ cl(SetKernelArg(impl->kernels.copy, 2,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw)));
+ cl(SetKernelArg(impl->kernels.copy, 3,SKC_CL_ARG(impl->runtime->block_pool.size->ring_mask)));
+
+ cl(SetKernelArg(impl->kernels.copy, 4,SKC_CL_ARG(impl->reads)));
+ cl(SetKernelArg(impl->kernels.copy, 5,SKC_CL_ARG(curr_from.subbuf)));
+
+ cl(SetKernelArg(impl->kernels.copy, 6,SKC_CL_ARG(impl->cmds.buffer)));
+ cl(SetKernelArg(impl->kernels.copy, 7,SKC_CL_ARG(impl->blocks.buffer)));
+
+ cl(SetKernelArg(impl->kernels.copy, 8,SKC_CL_ARG(impl->ring.blocks_per.buffer)));
+ cl(SetKernelArg(impl->kernels.copy, 9,SKC_CL_ARG(impl->prev.rolling)));
+
+ cl(SetKernelArg(impl->kernels.copy,10,SKC_CL_ARG(impl->prev.from)));
+ cl(SetKernelArg(impl->kernels.copy,11,SKC_CL_ARG(pb_prev_span)));
+ cl(SetKernelArg(impl->kernels.copy,12,SKC_CL_ARG(impl->curr.from)));
+
+ cl_event complete;
+
+ skc_device_enqueue_kernel(impl->runtime->device,
+ SKC_DEVICE_KERNEL_ID_PATHS_COPY,
+ impl->cq,
+ impl->kernels.copy,
+ pb_cmds,
+ 0,NULL,&complete);
+
+ // set a callback on completion
+ cl(SetEventCallback(complete,CL_COMPLETE,
+ skc_path_builder_paths_copy_cb,
+ grid));
+
+ // immediately release
+ cl(ReleaseEvent(complete));
+
+ //
+ // remap as many subbuffers as possible after the kernel completes
+ //
+ // note that remaps are async and enqueued on the same command queue
+ // as the kernel launch
+ //
+ // we can't remap subbuffers that are in the possibly empty range
+ //
+ // cases:
+ //
+ // - curr.to == wip.to which means no blocks have been acquired
+ // - curr.to points to first block in (next) subbuf
+ // - otherwise, wip acquired blocks in the curr.to subbuf
+ //
+ // check for these first 2 cases!
+ //
+ union skc_ringdex_expand const prev_from = skc_ringdex_expand(impl,impl->prev.from);
+ skc_uint const no_wip = impl->curr.to == impl->wip.to;
+ skc_uint map_to = curr_to.subbuf + (is_partial && no_wip);
+
+ // remap all subbufs in range [from,to)
+ skc_path_builder_impl_map(impl,prev_from.subbuf,map_to);
+
+ // flush command queue
+ cl(Flush(impl->cq));
+
+ // save rolling
+ impl->prev.rolling = impl->wip.rolling.next;
+
+ // update prev and curr
+ if (no_wip)
+ {
+ //
+ // if there was no wip then round up to the next subbuf
+ //
+ skc_ringdex_wip_to_subbuf_inc(impl);
+
+ //
+ // update prev/curr with the incremented wip
+ //
+ impl->prev.from = impl->prev.to = impl->wip.to;
+ impl->curr.from = impl->curr.to = impl->wip.to;
+ }
+ else
+ {
+ //
+ // update prev with wip partials
+ //
+ impl->prev.from = impl->curr.to;
+ impl->prev.to = impl->wip .to;
+
+ //
+ // start curr on a new subbuf boundary
+ //
+ skc_ringdex_wip_to_subbuf_inc(impl);
+
+ impl->curr.from = impl->wip.to;
+ impl->curr.to = impl->wip.to;
+ }
+}
+
+//
+//
+//
+
+static
+void
+skc_path_builder_impl_acquire_subbuffer(struct skc_path_builder_impl * const impl,
+ skc_uint const subbuf)
+{
+ //
+ // FIXME -- move to a power-of-two subbuf size and kickstart path
+ // copies as early as possible
+ //
+ // FIXME -- the subbufs "self-clock" (flow control) the kernel
+ // launches and accounting. Combine all the subbuffers and release
+ // records into a single indexable struct instead of 3.
+ //
+ struct skc_subbuffer_cmds * const sc = impl->cmds.subbufs + subbuf;
+ struct skc_release_record * const release = impl->release.records + subbuf;
+ struct skc_scheduler * const scheduler = impl->runtime->scheduler;
+
+ // can't proceed until the paths have been released
+ SKC_SCHEDULER_WAIT_WHILE(scheduler,release->from != release->to);
+
+ // throw in a scheduler yield ... FIXME -- get rid of
+ skc_scheduler_yield(scheduler);
+
+ // can't proceed until the subbuffer is mapped
+ cl(WaitForEvents(1,&sc->map));
+}
+
+//
+//
+//
+
+static
+union skc_ringdex_expand
+skc_path_builder_impl_acquire_block(struct skc_path_builder_impl * const impl)
+{
+ // break ringdex into components
+ union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to);
+
+ // does wip ringdex point to a new subbuffer?
+ if (to.block == 0)
+ {
+ // potentially spin/block waiting for subbuffer
+ skc_path_builder_impl_acquire_subbuffer(impl,to.subbuf);
+ }
+
+ // post increment wip.to
+ skc_ringdex_wip_to_block_inc(impl);
+
+ return to;
+}
+
+//
+//
+//
+
+static
+skc_uint
+skc_rolling_block(skc_uint const rolling, skc_uint const tag)
+{
+ return rolling | tag;
+}
+
+static
+skc_uint
+skc_rolling_subblock(skc_uint const rolling, skc_uint const subblock, skc_uint const tag)
+{
+ return rolling | (subblock << SKC_TAGGED_BLOCK_ID_BITS_TAG) | tag;
+}
+
+static
+void
+skc_rolling_inc(struct skc_path_builder_impl * const impl)
+{
+ impl->wip.rolling.next += impl->wip.rolling.one;
+}
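+
+//
+// Layout sketch (as implied by the helpers above): the low
+// SKC_TAGGED_BLOCK_ID_BITS_TAG bits of a command word hold the tag
+// and the upper bits hold the rolling block id, optionally offset by
+// a subblock index. The rolling counter advances by wip.rolling.one
+// -- subblocks-per-block times the tag count -- so successive blocks
+// never produce colliding tagged ids.
+//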
+
+//
+//
+//
+
+static
+void *
+skc_path_builder_impl_new_command(struct skc_path_builder_impl * const impl,
+ skc_uint const rolling,
+ skc_cmd_paths_copy_tag const tag)
+{
+ // bump blocks count
+ impl->wip.head->header.blocks += 1;
+
+ // acquire a block
+ union skc_ringdex_expand const to = skc_path_builder_impl_acquire_block(impl);
+
+ // make a pointer
+ union skc_tagged_block_id * const cmds_subbuf = impl->cmds.subbufs[to.subbuf].host;
+
+ // store command for block
+ cmds_subbuf[to.block].u32 = skc_rolling_block(rolling,tag);
+
+#if 0
+ // store command for block
+ cmds_subbuf[to.block].u32 = skc_rolling_block(impl->wip.rolling.next,tag);
+
+ // increment rolling
+ skc_rolling_inc(impl);
+#endif
+
+ // return pointer to block
+ float * const blocks_subbuf = impl->blocks.subbufs[to.subbuf].host;
+
+ // FIXME -- make it easier to get config constant
+ return blocks_subbuf + (to.block * impl->runtime->config->block.words);
+}
+
+//
+//
+//
+
+static
+void
+skc_path_builder_impl_flush_node(struct skc_path_builder_impl * const impl)
+{
+ // store command to subbuf and get pointer to blocks subbuf
+ void * const block = skc_path_builder_impl_new_command(impl,impl->wip.ids.rolling,
+ SKC_CMD_PATHS_COPY_TAG_NODE);
+
+ // copy head to blocks subbuf -- write-only
+ memcpy(block,impl->wip.node,impl->runtime->config->block.bytes);
+}
+
+static
+void
+skc_path_builder_impl_flush_head(struct skc_path_builder_impl * const impl)
+{
+ // store command to subbuf and get pointer to blocks subbuf
+ void * const block = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next,
+ SKC_CMD_PATHS_COPY_TAG_HEAD);
+
+ // copy head to blocks subbuf -- write-only
+ memcpy(block,impl->wip.head,impl->runtime->config->block.bytes);
+
+ // increment rolling
+ skc_rolling_inc(impl);
+
+ // the 'to' index is non-inclusive so assign wip.to after flush_head
+ impl->curr.to = impl->wip.to;
+}
+
+//
+//
+//
+
+static
+void
+skc_path_builder_impl_new_node_block(struct skc_path_builder_impl * const impl)
+{
+ // update final block id in node
+ impl->wip.ids.next->u32 = skc_rolling_block(impl->wip.rolling.next,SKC_BLOCK_ID_TAG_PATH_NEXT);
+
+ // if wip.ids is not the header then flush now full wip node
+ if (impl->wip.head->header.nodes > 0)
+ skc_path_builder_impl_flush_node(impl);
+
+ // bump node count
+ impl->wip.head->header.nodes += 1;
+
+ // save current rolling
+ impl->wip.ids.rolling = impl->wip.rolling.next;
+
+ // increment rolling
+ skc_rolling_inc(impl);
+
+ // update wip.ids.*
+ impl->wip.ids.next = impl->wip.node->tag_ids;
+ impl->wip.ids.rem = impl->runtime->config->block.words;
+}
+
+static
+void
+skc_path_builder_impl_new_segs_block(struct skc_path_builder_impl * const impl)
+{
+ impl->wip.subblocks.rem = impl->runtime->config->block.subblocks; // FIXME -- move constants closer to structure
+ impl->wip.subblocks.rolling = impl->wip.rolling.next;
+ impl->wip.subblocks.next = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next,
+ SKC_CMD_PATHS_COPY_TAG_SEGS);
+ impl->wip.subblocks.idx = 0;
+
+ // increment rolling
+ skc_rolling_inc(impl);
+}
+
+//
+//
+//
+
+static
+void
+skc_path_builder_impl_acquire_subblocks(struct skc_path_builder_impl * const impl,
+ skc_block_id_tag tag,
+ skc_uint vertices,
+ float * * subblocks)
+{
+ //
+ // FIRST TAG RECORDS THE ELEMENT TYPE
+ //
+ while (true)
+ {
+ // if only one block id slot is left in the node then acquire a new
+ // node block and append its block id with a 'next' tag
+ if (impl->wip.ids.rem == 1)
+ skc_path_builder_impl_new_node_block(impl);
+
+ // if zero subblocks left then acquire a new subblock block and
+ // append its block id
+ if (impl->wip.subblocks.rem == 0)
+ skc_path_builder_impl_new_segs_block(impl);
+
+ // save first command -- tag and subblocks may have been updated
+ impl->wip.ids.next->u32 = skc_rolling_subblock(impl->wip.subblocks.rolling,impl->wip.subblocks.idx,tag);
+
+ // increment node block subblock pointer
+ impl->wip.ids.next += 1;
+ impl->wip.ids.rem -= 1;
+
+ // how many vertices can we store
+ skc_uint rem = min(vertices,impl->wip.subblocks.rem);
+
+ // decrement vertices
+ vertices -= rem;
+ impl->wip.subblocks.rem -= rem;
+ impl->wip.subblocks.idx += rem;
+
+ // assign subblocks
+ do {
+ *subblocks++ = impl->wip.subblocks.next;
+ impl->wip.subblocks.next += impl->runtime->config->subblock.words;
+ // FIXME -- move constants closer to structure
+ } while (--rem > 0);
+
+ // anything left to do?
+ if (vertices == 0)
+ break;
+
+ // any tag after this will be a caboose command
+ tag = SKC_BLOCK_ID_TAG_PATH_NEXT;
+ }
+}
+
+//
+//
+//
+
+static
+void
+skc_path_builder_pfn_end(struct skc_path_builder_impl * const impl, skc_path_t * const path)
+{
+ // finalize incomplete active subblocks -- we don't care about any
+ // remaining unused subblocks in block
+ skc_path_builder_finalize_subblocks(impl->path_builder);
+
+ // mark remaining wip.ids in the head or node as invalid
+ skc_path_builder_impl_finalize_node(impl);
+
+ // flush the node if the wip ids are in a node and not still in the head
+ if (impl->wip.head->header.nodes >= 1)
+ skc_path_builder_impl_flush_node(impl);
+
+ // acquire path host id
+ *path = skc_runtime_handle_device_acquire(impl->runtime); // FIXME -- MAY WANT TO GRAB AN ID ON BEGIN
+
+ // save path host handle
+ impl->wip.head->header.handle = *path;
+
+ // flush head -- acquires a block and bumps head->header.blocks
+ skc_path_builder_impl_flush_head(impl);
+
+ // get current release
+ struct skc_release_record * const release = skc_release_curr(impl);
+
+ // acquire grid if null
+ if (release->grid == NULL)
+ {
+ release->grid =
+ SKC_GRID_DEPS_ATTACH(impl->runtime->deps,
+ &release->grid, // NULL on start/force
+ release, // data payload
+ skc_path_builder_grid_pfn_waiting,
+ NULL, // no execute pfn
+ skc_path_builder_grid_pfn_dispose);
+ }
+
+ // update grid map
+ skc_grid_map(release->grid,*path);
+
+ // update path release
+ impl->release.paths[release->to] = *path;
+
+ // increment release.to
+ release->to = (release->to + 1) % impl->ring.blocks_per.buffer;
+
+ // add guard bit
+ *path |= SKC_TYPED_HANDLE_TYPE_IS_PATH;
+
+#if 1
+ //
+ // eager kernel launch?
+ //
+ {
+ union skc_ringdex_expand const curr_from = skc_ringdex_expand(impl,impl->curr.from);
+ union skc_ringdex_expand const curr_to = skc_ringdex_expand(impl,impl->curr.to);
+
+ if (curr_from.subbuf != curr_to.subbuf)
+ {
+ skc_grid_start(release->grid);
+ // skc_scheduler_yield(impl->runtime->scheduler);
+ }
+ }
+#endif
+}
+
+//
+// FIXME -- clean up accessing of CONFIG constants in these 3 routines
+//
+
+static
+void
+skc_path_builder_pfn_new_line(struct skc_path_builder_impl * const impl)
+{
+ // acquire subblock pointers
+ skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_LINE,4,
+ impl->path_builder->line.coords);
+
+ // increment line count
+ impl->wip.head->header.prims += 1;
+
+ // update line.rem count
+ impl->path_builder->line.rem = impl->runtime->config->subblock.words;
+}
+
+static
+void
+skc_path_builder_pfn_new_quad(struct skc_path_builder_impl * const impl)
+{
+ // acquire subblock pointers
+ skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_QUAD,6,
+ impl->path_builder->quad.coords);
+
+ // increment quad count
+ impl->wip.head->header.prims += 1;
+
+ // update quad.rem count
+ impl->path_builder->quad.rem = impl->runtime->config->subblock.words;
+}
+
+static
+void
+skc_path_builder_pfn_new_cubic(struct skc_path_builder_impl * const impl)
+{
+ // acquire subblock pointers
+ skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_CUBIC,8,
+ impl->path_builder->cubic.coords);
+
+ // increment cubic count
+ impl->wip.head->header.prims += 1;
+
+ // update cubic.rem count
+ impl->path_builder->cubic.rem = impl->runtime->config->subblock.words;
+}
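+
+//
+// Flow sketch (host side, as implied by the coords/rem fields): the
+// generic path builder front end writes coordinates through the
+// line/quad/cubic coords[] pointers and decrements the matching .rem
+// counter; when a counter hits zero the next primitive of that type
+// invokes the corresponding pfn_new_* hook above to acquire fresh
+// subblock pointers.
+//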
+
+//
+//
+//
+
+static
+void
+skc_path_builder_pfn_release(struct skc_path_builder_impl * const impl)
+{
+ // decrement reference count
+ if (--impl->path_builder->refcount != 0)
+ return;
+
+ //
+ // otherwise, dispose of everything
+ //
+ struct skc_runtime * const runtime = impl->runtime;
+
+ // free path builder
+ skc_runtime_host_perm_free(impl->runtime,impl->path_builder);
+
+ // release cq
+ skc_runtime_release_cq_in_order(runtime,impl->cq);
+
+ // release kernels
+ cl(ReleaseKernel(impl->kernels.alloc));
+ cl(ReleaseKernel(impl->kernels.copy));
+
+ // free blocks extents
+ cl(ReleaseMemObject(impl->blocks.buffer));
+ skc_runtime_host_perm_free(runtime,impl->blocks.subbufs);
+
+ cl(ReleaseMemObject(impl->cmds.buffer));
+ skc_runtime_host_perm_free(runtime,impl->cmds.subbufs);
+
+ // free records
+ skc_runtime_host_perm_free(runtime,impl->release.records);
+ skc_runtime_host_perm_free(runtime,impl->release.paths);
+
+ // release staging head and node
+ skc_runtime_host_perm_free(runtime,impl->wip.head);
+ skc_runtime_host_perm_free(runtime,impl->wip.node);
+
+ // release reads scratch array
+ cl(ReleaseMemObject(impl->reads));
+
+ // for all subbuffers
+ // unmap subbuffer
+ // release subbuffer
+ // printf("%s not releasing subbuffers\n",__func__);
+
+ skc_runtime_host_perm_free(impl->runtime,impl);
+}
+
+//
+//
+//
+
+skc_err
+skc_path_builder_cl_12_create(struct skc_context * const context,
+ struct skc_path_builder * * const path_builder)
+{
+ //
+ // retain the context
+ // skc_context_retain(context);
+ //
+ struct skc_runtime * const runtime = context->runtime;
+
+ // allocate path builder
+ (*path_builder) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**path_builder));
+
+ // init state
+ SKC_ASSERT_STATE_INIT((*path_builder),SKC_PATH_BUILDER_STATE_READY);
+
+ (*path_builder)->context = context;
+
+ // save opaque impl-specific pointers
+ (*path_builder)->begin = skc_path_builder_pfn_begin;
+ (*path_builder)->end = skc_path_builder_pfn_end;
+ (*path_builder)->new_line = skc_path_builder_pfn_new_line;
+ (*path_builder)->new_quad = skc_path_builder_pfn_new_quad;
+ (*path_builder)->new_cubic = skc_path_builder_pfn_new_cubic;
+ (*path_builder)->release = skc_path_builder_pfn_release;
+
+ // initialize path builder counts
+ (*path_builder)->line.rem = 0;
+ (*path_builder)->quad.rem = 0;
+ (*path_builder)->cubic.rem = 0;
+
+ (*path_builder)->refcount = 1;
+
+ struct skc_path_builder_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl));
+
+ (*path_builder)->impl = impl;
+
+ //
+ // init impl
+ //
+ impl->path_builder = *path_builder;
+ impl->runtime = runtime;
+
+ impl->cq = skc_runtime_acquire_cq_in_order(runtime);
+
+ impl->kernels.alloc = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_ALLOC);
+ impl->kernels.copy = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_COPY);
+
+ //
+ // FIXME -- let these config constants remain constant and in place
+ //
+ struct skc_config const * const config = runtime->config;
+
+ impl->ring.subbufs = config->paths_copy.buffer.count;
+ impl->ring.blocks_per.buffer = config->paths_copy.subbuf.count * config->paths_copy.buffer.count;
+ impl->ring.blocks_per.subbuf = config->paths_copy.subbuf.count;
+ //
+ // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ //
+
+ cl_int cl_err;
+
+ // allocate large device-side extent for path data
+ impl->blocks.buffer = clCreateBuffer(runtime->cl.context,
+ CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+ config->paths_copy.block.buffer, // FIXME -- either use config or local constants everywhere
+ NULL,&cl_err); cl_ok(cl_err);
+
+ // allocate small host-side array of pointers to mapped subbufs
+ impl->blocks.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
+ impl->ring.subbufs *
+ sizeof(*impl->blocks.subbufs));
+
+ // allocate large device-side extent for path copy commands
+ impl->cmds.buffer = clCreateBuffer(runtime->cl.context,
+ CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+ config->paths_copy.command.buffer,
+ NULL,&cl_err); cl_ok(cl_err);
+
+ // allocate small host-side array of pointers to mapped subbufs
+ impl->cmds.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
+ impl->ring.subbufs *
+ sizeof(*impl->cmds.subbufs));
+
+ // allocate small host-side array of intervals of path handles
+ impl->release.records = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
+ impl->ring.subbufs *
+ sizeof(*impl->release.records));
+
+ // allocate large host-side array that is max # of path handles in flight
+ impl->release.paths = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
+ impl->ring.blocks_per.buffer *
+ sizeof(*impl->release.paths));
+
+ // small scratch used by kernels
+ impl->reads = clCreateBuffer(runtime->cl.context,
+ CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS,
+ sizeof(skc_uint) * impl->ring.subbufs,
+ NULL,&cl_err); cl_ok(cl_err);
+
+ // initialize release record with impl backpointer
+ for (skc_uint ii=0; ii<impl->ring.subbufs; ii++)
+ {
+ struct skc_release_record * record = impl->release.records + ii;
+
+ record->impl = impl;
+ record->grid = NULL;
+ record->from = record->to = ii * impl->ring.blocks_per.subbuf;
+ }
+
+ //
+ // allocate and map subbuffers -- we always check the command
+ // subbuffer's map/unmap events before touching it or its associated
+ // block subbuffer.
+ //
+ struct skc_subbuffer_blocks * sb = impl->blocks.subbufs;
+ struct skc_subbuffer_cmds * sc = impl->cmds .subbufs;
+
+ cl_buffer_region rb = { 0, config->paths_copy.block.subbuf };
+ cl_buffer_region rc = { 0, config->paths_copy.command.subbuf };
+
+ // for each subbuffer
+ for (skc_uint ii=0; ii<config->paths_copy.buffer.count; ii++)
+ {
+ sb->device = clCreateSubBuffer(impl->blocks.buffer,
+ CL_MEM_HOST_WRITE_ONLY,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ &rb,
+ &cl_err); cl_ok(cl_err);
+
+ sb->host = clEnqueueMapBuffer(impl->cq,
+ sb->device,
+ CL_FALSE,
+ CL_MAP_WRITE_INVALIDATE_REGION,
+ 0,rb.size,
+ 0,NULL,NULL,
+ &cl_err); cl_ok(cl_err);
+
+ sc->device = clCreateSubBuffer(impl->cmds.buffer,
+ CL_MEM_HOST_WRITE_ONLY,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ &rc,
+ &cl_err); cl_ok(cl_err);
+
+ sc->host = clEnqueueMapBuffer(impl->cq,
+ sc->device,
+ CL_FALSE,
+ CL_MAP_WRITE_INVALIDATE_REGION,
+ 0,rc.size,
+ 0,NULL,&sc->map,
+ &cl_err); cl_ok(cl_err);
+ sb += 1;
+ sc += 1;
+
+ rb.origin += rb.size;
+ rc.origin += rc.size;
+ }
+
+ //
+ // initialize remaining members
+ //
+ impl->prev.from = 0;
+ impl->prev.to = 0;
+ impl->prev.rolling = 0;
+
+ impl->curr.from = 0;
+ impl->curr.to = 0;
+
+ impl->wip.to = 0;
+
+ impl->wip.head = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes);
+ impl->wip.node = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes);
+
+ impl->wip.rolling.one = SKC_BLOCK_ID_TAG_COUNT * config->block.subblocks;
+ impl->wip.rolling.next = 0;
+
+ // for now, completely initialize builder before returning
+ cl(Finish(impl->cq));
+
+ return SKC_ERR_SUCCESS;
+}
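+
+//
+// Lifecycle sketch (summarizing the routines above): create() maps
+// every subbuffer up front; begin()/new_*()/end() stream path data
+// into the mapped subbuffers; grid_pfn_waiting() unmaps the filled
+// subbuffers and launches the alloc + copy kernels; the copy
+// completion callback schedules grid completion; and
+// grid_pfn_dispose() releases the path handles so a subbuffer can be
+// remapped and reused.
+//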
+
+//
+//
+//