/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <float.h>
#include <assert.h>
#include <stdbool.h>

#include "common/cl/assert_cl.h"

#include "context.h"
#include "handle.h"
#include "grid.h"
#include "path.h"
#include "path_builder.h"

#include "config_cl.h"
#include "export_cl_12.h"
#include "runtime_cl_12.h"
#include "path_builder_cl_12.h"

//
// OpenCL 1.2 devices support mapping of buffers into the host address
// space.
//
// Mapped buffers must be aligned on a MIN_DATA_TYPE_ALIGN_SIZE byte
// boundary (e.g. 128 bytes).  This complicates coordinating sharing
// of data between the host and the device.
//
// Some OpenCL 2.0 devices support fine-grained shared virtual memory
// pointers with byte-addressing and allow simpler coordination
// strategies at the cost of maintaining cache coherency.
//
// The path builder is focused on moving bulk path data from the host
// into the device-managed "block" memory pool and arranging it into a
// SIMT/SIMD-friendly data structure that can be efficiently read by
// the rasterizer.
//
// Note that one simplifying assumption is that the maximum length of
// a *single* path can't be larger than what fits in the single extent
// (which is split into M subbuffers).  This would be a very long path
// and a legitimate size limitation.
//
// For some systems, it may be appropriate to never pull path data
// into the device-managed block pool and instead present the path
// data to the device in a temporarily available allocated memory
// "zone" of paths that can be discarded all at once.
//
// For other systems, it may be appropriate to simply copy the path
// data from host to device.
//
// But the majority of OpenCL (and VK, MTL, DX12) devices we'll be
// targeting support basic map/unmap functionality similar to OpenCL
// 1.2.  Furthermore, not all OpenCL 2.0 devices support fine-grained
// sharing of memory and still require a map/unmap step... but note
// that they all support byte-aligned mapping and subbuffers.
//
// The general strategy that this particular CL_12 implementation uses
// is to allocate a large mappable bulk-data path buffer and an
// auxiliary mappable command buffer.
//
// The buffers are split into a reasonable number of properly aligned
// subbuffers to enable simultaneous host and device access.
//
// Blocks:
//   1 extent
//   M mapped subbuffers (configurable) to allow for concurrency
//
// Commands:
//   1 extent
//   M mapped subbuffers (configurable) to allow for concurrency
//
// Spans:
//   M hi/lo structures
//
// { cl_sub, void*, event, base }
//
//   - size of sub buffer
//   - remaining
//   - counts
//
// For any kernel launch, at most one path will be discontiguous and
// defined across two sub-buffers.
//
// Nodes are updated locally until full and then stored so they will
// never be incomplete.  Headers are stored locally until the path is
// ended so they will never be incomplete.
//
// A line, quad or cubic acquires 4/6/8 segments which may be spread
// across one or more contiguous blocks.
//
// If a flush() occurs then the remaining columns of multi-segment
// paths are initialized with zero-length line, quad and cubic
// elements.
//
// Every block's command word has a type and a count acquired from a
// rolling counter.
//
// The kernel is passed two spans of blocks { base, count } to
// process.  The grid must process (lo.count + hi.count) blocks.
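//
// Illustrative sketch (not compiled): how a ringdex maps onto the
// subbuffer ring.  The subbuffer and block counts below are
// assumptions for the example only -- the real values come from the
// runtime config (config->paths_copy.*).
//
#if 0
static
void
skc_ringdex_example(void)
{
  skc_uint const blocks_per_subbuf = 256;  // example value
  skc_uint const subbufs           = 4;    // example value
  skc_uint const blocks_per_buffer = subbufs * blocks_per_subbuf; // 1024

  skc_uint const examples[] = { 0, 255, 256, 1023 };

  // a ringdex in [0,1024) expands into (subbuf,block) by division:
  //   0 -> (0,0) / 255 -> (0,255) / 256 -> (1,0) / 1023 -> (3,255)
  for (skc_uint ii=0; ii<4; ii++)
    {
      div_t const qr = div((int)examples[ii],(int)blocks_per_subbuf);

      printf("ringdex %4u -> subbuf %d, block %d\n",examples[ii],qr.quot,qr.rem);
    }
}
#endif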
//

struct skc_subbuffer_blocks
{
  cl_mem   device;
  void *   host;
};

struct skc_subbuffer_cmds
{
  cl_mem   device;
  void *   host;
  cl_event map;
};

//
// ringdex is an index with range [0, blocks-per-subbuf * subbufs-per-buffer )
//

typedef skc_uint skc_ringdex_t;

union skc_ringdex_expand
{
  div_t qr;

  struct {
#ifndef SKC_DIV_REM_BEFORE_QUOT // offsetof(div_t,quot) != 0
    skc_uint subbuf;
    skc_uint block;
#else
    skc_uint block;
    skc_uint subbuf;
#endif
  };
};

//
// this record is executed by the grid
//

struct skc_release_record
{
  struct skc_path_builder_impl * impl; // back pointer to impl

  skc_grid_t                     grid; // pointer to scheduled grid

  skc_uint                       from; // inclusive starting index  : [from,to)
  skc_uint                       to;   // non-inclusive ending index: [from,to)
};

//
//
//

struct skc_path_builder_impl
{
  struct skc_path_builder * path_builder;

  struct skc_runtime      * runtime;

  cl_command_queue          cq;

  struct {
    cl_kernel               alloc;
    cl_kernel               copy;
  } kernels;

  //
  // FIXME -- make this pointer to constant config
  //
  // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
  struct {
    skc_uint                subbufs;  // how many subbufs in the buffer?

    struct {
      skc_uint              buffer;   // how many blocks in the buffer?
      skc_uint              subbuf;   // how many blocks in a subbuf?
    } blocks_per;
  } ring;
  //
  // ^^^^^^^^^^^ don't duplicate these constants ^^^^^^^^^^^^^^^^^^
  //

  struct {
    cl_mem                        buffer;  // backing buffer for blocks
    struct skc_subbuffer_blocks * subbufs; // array of structures
  } blocks;

  struct {
    cl_mem                        buffer;  // backing buffer for commands
    struct skc_subbuffer_cmds   * subbufs; // array of structures
  } cmds;

  struct {
    struct skc_release_record   * records; // max release records is equal to max subbufs
    skc_path_t                  * paths;   // max paths is less than or equal to max commands
  } release;

  cl_mem reads; // each kernel only requires one word to store the block pool "base"

  struct {
    skc_uint      rolling; // rolling counter used by cmds to map to block pool alloc
    skc_ringdex_t from;
    skc_ringdex_t to;
  } prev;

  struct {
    skc_ringdex_t from;
    skc_ringdex_t to;
  } curr;

  struct {
    struct skc_path_head * head; // pointer to local path header -- not written until path end
    struct skc_path_node * node; // pointer to local node -- may alias head until head is full

    struct {
      skc_uint                    rolling; // rolling counter of wip node -- valid after one node is allocated
      union skc_tagged_block_id * next;    // next slot in node -- may initially point to head.ids
      skc_uint                    rem;     // how many id slots left in node block
    } ids;

    struct {
      skc_uint rem;     // how many subblocks left in block?
      skc_uint rolling; // rolling counter of block of subblocks
      float  * next;    // next subblock in current subblock block
      skc_uint idx;     // index of next subblock
    } subblocks;

    struct {
      skc_uint one;  // .block = 1
      skc_uint next; // rolling counter used by cmds to map to block pool alloc
    } rolling;

    skc_ringdex_t to; // ringdex of _next_available_ command/block in ring -- FIXME -- should be current
  } wip;
};

//
// FIXME -- move to a pow2 subbuffer size and dispense with division
// and modulo operations
//

static
union skc_ringdex_expand
skc_ringdex_expand(struct skc_path_builder_impl * const impl,
                   skc_ringdex_t                  const ringdex)
{
  return (union skc_ringdex_expand){ .qr = div(ringdex,impl->ring.blocks_per.subbuf) };
}
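//
// A minimal sketch of the pow2 variant suggested by the FIXME above:
// if blocks_per.subbuf were constrained to a power of two, the div()
// would become a shift and a mask.  The 'blocks_per_subbuf_log2' and
// 'blocks_per_subbuf_mask' fields are hypothetical -- they would need
// to be added to impl->ring at creation time.
//
#if 0
static
union skc_ringdex_expand
skc_ringdex_expand_pow2(struct skc_path_builder_impl * const impl,
                        skc_ringdex_t                  const ringdex)
{
  return (union skc_ringdex_expand){
    .subbuf = ringdex >> impl->ring.blocks_per_subbuf_log2, // hypothetical field
    .block  = ringdex &  impl->ring.blocks_per_subbuf_mask  // hypothetical field
  };
}
#endif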
static
void
skc_ringdex_wip_to_block_inc(struct skc_path_builder_impl * const impl)
{
  //
  // FIXME - which is faster?
  //
#if 1
  impl->wip.to = (impl->wip.to + 1) % impl->ring.blocks_per.buffer;
#else
  impl->wip.to -= (impl->wip.to + 1 < impl->ring.blocks_per.buffer) ? -1 : impl->wip.to;
#endif

  // this path is too long -- for now assert() and die
  assert(impl->wip.to != impl->curr.from);
}

static
skc_ringdex_t
skc_ringdex_span(struct skc_path_builder_impl * const impl,
                 skc_ringdex_t                  const from,
                 skc_ringdex_t                  const to)
{
  return (to - from) % impl->ring.blocks_per.buffer;
}

static
void
skc_ringdex_wip_to_subbuf_inc(struct skc_path_builder_impl * const impl)
{
  union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to);

  // nothing to do if this is the first block in the subbuf
  if (to.block == 0)
    return;

  // otherwise increment and mod
  skc_uint const new_subbuf = (to.subbuf + 1) % impl->ring.subbufs;

  impl->wip.to = new_subbuf * impl->ring.blocks_per.subbuf;
}

static
skc_bool
skc_ringdex_curr_is_equal(struct skc_path_builder_impl * const impl)
{
  return impl->curr.from == impl->curr.to;
}

static
skc_bool
skc_ringdex_prev_is_equal(struct skc_path_builder_impl * const impl)
{
  return impl->prev.from == impl->prev.to;
}

static
skc_uint
skc_ringdex_dont_map_last(struct skc_path_builder_impl * const impl,
                          skc_uint                       const to_block)
{
  // no blocks acquired OR this is the last block in the subbuf
  return !((impl->wip.to == impl->curr.to) || (to_block == 0));
}

//
//
//

static
struct skc_release_record *
skc_release_curr(struct skc_path_builder_impl * const impl)
{
  union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from);

  return impl->release.records + curr_from.subbuf;
}

//
// FIXME -- get rid of all distant config references -- grab them all
// at creation time
//

static
void
skc_path_builder_pfn_begin(struct skc_path_builder_impl * const impl)
{
  // init header counters
  // { handle, blocks, nodes, prims }
  impl->wip.head->header = (union skc_path_header){
    .handle = 0,
    .blocks = 0,
    .nodes  = 0,
    .prims  = 0
  };

  // FIXME -- BOUNDS SHOULD USE SIMD4 TRICK AND NEGATE ONE OF THE CORNERS
  impl->wip.head->bounds = (union skc_path_bounds){ +FLT_MIN, +FLT_MIN, -FLT_MIN, -FLT_MIN };

  // point wip ids at local head node
  impl->wip.ids.next = impl->wip.head->tag_ids; // point to local head node
  impl->wip.ids.rem  = impl->runtime->config->block.words - SKC_PATH_HEAD_WORDS; // FIXME -- save this constant somewhere

  // start with no subblocks
  impl->wip.subblocks.rem = 0;
}

//
//
//

static
void
skc_path_builder_impl_finalize_node(struct skc_path_builder_impl * const impl)
{
#if 1
  //
  // FIXME -- a Duff's device might be optimal here but would have to
  // be customized per device since nodes could be 16-128+ words
  //
  while (impl->wip.ids.rem > 0)
    {
      impl->wip.ids.rem      -= 1;
      impl->wip.ids.next->u32 = SKC_TAGGED_BLOCK_ID_INVALID;
      impl->wip.ids.next     += 1;
    }
#else
  memset(&impl->wip.ids.next->u32,
         SKC_TAGGED_BLOCK_ID_INVALID, // 0xFF
         sizeof(impl->wip.ids.next->u32) * impl->wip.ids.rem);

  impl->wip.ids.next += impl->wip.ids.rem;
  impl->wip.ids.rem   = 0;
#endif
}
//
//
//

static
void
skc_zero_float(skc_float * p, skc_uint rem)
{
  memset(p,0,sizeof(*p)*rem);
}

static
void
skc_path_builder_finalize_subblocks(struct skc_path_builder * const path_builder)
{
  //
  // FIXME -- it might be more performant to zero the remaining
  // columns in a subblock -- a subblock at a time -- instead of the
  // same column across all the subblocks
  //
#if 0
  while (path_builder->line.rem > 0)
    {
      --path_builder->line.rem;

      *path_builder->line.coords[0]++ = 0.0f;
      *path_builder->line.coords[1]++ = 0.0f;
      *path_builder->line.coords[2]++ = 0.0f;
      *path_builder->line.coords[3]++ = 0.0f;
    }

  while (path_builder->quad.rem > 0)
    {
      --path_builder->quad.rem;

      *path_builder->quad.coords[0]++ = 0.0f;
      *path_builder->quad.coords[1]++ = 0.0f;
      *path_builder->quad.coords[2]++ = 0.0f;
      *path_builder->quad.coords[3]++ = 0.0f;
      *path_builder->quad.coords[4]++ = 0.0f;
      *path_builder->quad.coords[5]++ = 0.0f;
    }

  while (path_builder->cubic.rem > 0)
    {
      --path_builder->cubic.rem;

      *path_builder->cubic.coords[0]++ = 0.0f;
      *path_builder->cubic.coords[1]++ = 0.0f;
      *path_builder->cubic.coords[2]++ = 0.0f;
      *path_builder->cubic.coords[3]++ = 0.0f;
      *path_builder->cubic.coords[4]++ = 0.0f;
      *path_builder->cubic.coords[5]++ = 0.0f;
      *path_builder->cubic.coords[6]++ = 0.0f;
      *path_builder->cubic.coords[7]++ = 0.0f;
    }
#else
  if (path_builder->line.rem > 0)
    {
      skc_zero_float(path_builder->line.coords[0],path_builder->line.rem);
      skc_zero_float(path_builder->line.coords[1],path_builder->line.rem);
      skc_zero_float(path_builder->line.coords[2],path_builder->line.rem);
      skc_zero_float(path_builder->line.coords[3],path_builder->line.rem);

      path_builder->line.rem = 0;
    }

  if (path_builder->quad.rem > 0)
    {
      skc_zero_float(path_builder->quad.coords[0],path_builder->quad.rem);
      skc_zero_float(path_builder->quad.coords[1],path_builder->quad.rem);
      skc_zero_float(path_builder->quad.coords[2],path_builder->quad.rem);
      skc_zero_float(path_builder->quad.coords[3],path_builder->quad.rem);
      skc_zero_float(path_builder->quad.coords[4],path_builder->quad.rem);
      skc_zero_float(path_builder->quad.coords[5],path_builder->quad.rem);

      path_builder->quad.rem = 0;
    }

  if (path_builder->cubic.rem > 0)
    {
      skc_zero_float(path_builder->cubic.coords[0],path_builder->cubic.rem);
      skc_zero_float(path_builder->cubic.coords[1],path_builder->cubic.rem);
      skc_zero_float(path_builder->cubic.coords[2],path_builder->cubic.rem);
      skc_zero_float(path_builder->cubic.coords[3],path_builder->cubic.rem);
      skc_zero_float(path_builder->cubic.coords[4],path_builder->cubic.rem);
      skc_zero_float(path_builder->cubic.coords[5],path_builder->cubic.rem);
      skc_zero_float(path_builder->cubic.coords[6],path_builder->cubic.rem);
      skc_zero_float(path_builder->cubic.coords[7],path_builder->cubic.rem);

      path_builder->cubic.rem = 0;
    }
#endif
}
//
//
//

static
void
skc_path_builder_impl_unmap(struct skc_path_builder_impl * const impl,
                            skc_uint                             from,
                            skc_uint                             to)
{
  // to might be out of range
  to = to % impl->ring.subbufs;

#if 0
  fprintf(stderr,"unmap: [%2u,%2u)\n",from,to);
#endif

  while (from != to)
    {
      // bring 'from' back in range
      from = from % impl->ring.subbufs;

      struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from;
      struct skc_subbuffer_cmds   * const cmds   = impl->cmds  .subbufs + from;

      cl(EnqueueUnmapMemObject(impl->cq,
                               blocks->device,
                               blocks->host,
                               0,NULL,NULL));

      cl(EnqueueUnmapMemObject(impl->cq,
                               cmds->device,
                               cmds->host,
                               0,NULL,NULL));

      // bring 'from' back in range
      from = (from + 1) % impl->ring.subbufs;
    }
}

//
// FIXME -- reuse this in create()
//

static
void
skc_path_builder_impl_map(struct skc_path_builder_impl * const impl,
                          skc_uint                             from,
                          skc_uint                             to)
{
  // to might be out of range
  to = to % impl->ring.subbufs;

#if 0
  fprintf(stderr,"  map: [%2u,%2u)\n",from,to);
#endif

  while (from != to)
    {
      cl_int cl_err;

      struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from;
      struct skc_subbuffer_cmds   * const cmds   = impl->cmds  .subbufs + from;

      blocks->host = clEnqueueMapBuffer(impl->cq,
                                        blocks->device,
                                        CL_FALSE,
                                        CL_MAP_WRITE_INVALIDATE_REGION,
                                        0,impl->runtime->config->paths_copy.block.subbuf,
                                        0,NULL,NULL,
                                        &cl_err); cl_ok(cl_err);

      cl(ReleaseEvent(cmds->map));

      cmds->host = clEnqueueMapBuffer(impl->cq,
                                      cmds->device,
                                      CL_FALSE,
                                      CL_MAP_WRITE_INVALIDATE_REGION,
                                      0,impl->runtime->config->paths_copy.command.subbuf,
                                      0,NULL,&cmds->map,
                                      &cl_err); cl_ok(cl_err);

      // bring 'from' back in range
      from = (from + 1) % impl->ring.subbufs;
    }

  //
  // FIXME -- when we switch to out-of-order queues we'll need a
  // barrier here
  //
}
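//
// A sketch of the barrier mentioned in the FIXME above, for a
// hypothetical switch to an out-of-order command queue.  OpenCL 1.2's
// clEnqueueBarrierWithWaitList() would order the preceding map
// commands ahead of any subsequently enqueued kernels.
//
#if 0
static
void
skc_path_builder_impl_map_barrier(struct skc_path_builder_impl * const impl)
{
  // on an out-of-order queue, force all prior commands to complete
  // before later commands begin executing
  cl(EnqueueBarrierWithWaitList(impl->cq,0,NULL,NULL));
}
#endif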
//
//
//

static
void
skc_path_builder_release_dispose(struct skc_release_record    * const release,
                                 struct skc_path_builder_impl * const impl)
{
  struct skc_runtime * runtime = impl->runtime;

  if (release->from <= release->to) // no wrap
    {
      skc_path_t const * paths = impl->release.paths + release->from;
      skc_uint           count = release->to - release->from;

      skc_grid_deps_unmap(runtime->deps,paths,count);
      skc_runtime_path_device_release(runtime,paths,count);
    }
  else // from > to implies wrap
    {
      skc_path_t const * paths_lo = impl->release.paths + release->from;
      skc_uint           count_lo = impl->ring.blocks_per.buffer - release->from;

      skc_grid_deps_unmap(runtime->deps,paths_lo,count_lo);
      skc_runtime_path_device_release(runtime,paths_lo,count_lo);

      skc_grid_deps_unmap(runtime->deps,impl->release.paths,release->to);
      skc_runtime_path_device_release(runtime,impl->release.paths,release->to);
    }

  release->to = release->from;
}

static
void
skc_path_builder_grid_pfn_dispose(skc_grid_t const grid)
{
  struct skc_release_record    * const release = skc_grid_get_data(grid);
  struct skc_path_builder_impl * const impl    = release->impl;

  skc_path_builder_release_dispose(release,impl);
}

static
void
// skc_path_builder_complete(struct skc_release_record * const release)
skc_path_builder_complete(skc_grid_t grid)
{
  //
  // notify deps that this grid is complete enough for other grids to
  // proceed
  //
  // the path builder still has some cleanup to do before all its
  // resources can be reused
  //
  skc_grid_complete(grid);
}

static
void
skc_path_builder_paths_copy_cb(cl_event event, cl_int status, skc_grid_t grid)
{
  SKC_CL_CB(status);

  struct skc_release_record * const release = skc_grid_get_data(grid);

  SKC_SCHEDULER_SCHEDULE(release->impl->runtime->scheduler,skc_path_builder_complete,grid);
}
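//
// Illustrative sketch (not compiled): how the two block spans handed
// to the paths-copy kernel in skc_path_builder_grid_pfn_waiting()
// below are computed.  The ring size is an assumption for the example
// only.
//
#if 0
static
void
skc_span_example(void)
{
  skc_uint const blocks_per_buffer = 1024;        // example ring size

  // a span wraps modulo the ring size: (to - from) % blocks
  skc_uint const prev_from = 1000, prev_to = 8;   // wraps around the ring
  skc_uint const curr_from = 8,    curr_to = 40;  // no wrap

  skc_uint const pb_prev_span = (prev_to - prev_from) % blocks_per_buffer; // 32
  skc_uint const pb_curr_span = (curr_to - curr_from) % blocks_per_buffer; // 32

  // the grid must process (lo.count + hi.count) blocks
  printf("prev %u + curr %u = %u blocks\n",pb_prev_span,pb_curr_span,pb_prev_span + pb_curr_span);
}
#endif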
//
//
//

static
void
skc_path_builder_grid_pfn_waiting(skc_grid_t const grid)
{
  struct skc_release_record    * const release = skc_grid_get_data(grid);
  struct skc_path_builder_impl * const impl    = release->impl;

  // 1. flush incomplete subblocks of path elements
  // 2. unmap subbuffer on cq.unmap
  // 3. flush cq.unmap
  // 4. launch kernel on cq.kernel but wait for unmap completion
  // 5. flush cq.kernel
  // 6. remap relevant subbuffers on cq.map but wait for kernel completion
  // 7. flush cq.map

  //
  // FIXME -- can be smarter about flushing if the wip paths are not
  // in the same subbuf as curr.to
  //
  // THIS IS IMPORTANT TO FIX
  //

  // flush incomplete subblocks
  skc_path_builder_finalize_subblocks(impl->path_builder);

  //
  // get range of subbufs that need to be unmapped
  //
  // note that impl->prev subbufs have already been unmapped
  //
  union skc_ringdex_expand curr_from  = skc_ringdex_expand(impl,impl->curr.from);
  union skc_ringdex_expand curr_to    = skc_ringdex_expand(impl,impl->curr.to);
  skc_uint           const is_partial = curr_to.block > 0;
  skc_uint           const unmap_to   = curr_to.subbuf + is_partial;

  //
  // unmap all subbufs in range [from,to)
  //
  skc_path_builder_impl_unmap(impl,curr_from.subbuf,unmap_to);

  //
  // launch kernels
  //
  skc_uint const pb_prev_span = skc_ringdex_span(impl,impl->prev.from,impl->prev.to);
  skc_uint const pb_curr_span = skc_ringdex_span(impl,impl->curr.from,impl->curr.to);
  skc_uint const pb_cmds      = pb_prev_span + pb_curr_span;

  //
  // 1) allocate blocks from pool
  //

  //
  // FIXME -- pack integers into struct/vector
  //
  cl(SetKernelArg(impl->kernels.alloc,0,SKC_CL_ARG(impl->runtime->block_pool.atomics.drw)));
  cl(SetKernelArg(impl->kernels.alloc,1,SKC_CL_ARG(impl->reads)));
  cl(SetKernelArg(impl->kernels.alloc,2,SKC_CL_ARG(curr_from.subbuf)));
  cl(SetKernelArg(impl->kernels.alloc,3,SKC_CL_ARG(pb_cmds)));

  skc_device_enqueue_kernel(impl->runtime->device,
                            SKC_DEVICE_KERNEL_ID_PATHS_ALLOC,
                            impl->cq,
                            impl->kernels.alloc,
                            1,
                            0,NULL,NULL);

  //
  // 2) copy blocks from unmapped device-accessible memory
  //

  //
  // FIXME -- pack integers into struct/vector and reduce 13 arguments down to 7
  //
  cl(SetKernelArg(impl->kernels.copy, 0,SKC_CL_ARG(impl->runtime->handle_pool.map.drw)));
  cl(SetKernelArg(impl->kernels.copy, 1,SKC_CL_ARG(impl->runtime->block_pool.ids.drw)));
  cl(SetKernelArg(impl->kernels.copy, 2,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw)));
  cl(SetKernelArg(impl->kernels.copy, 3,SKC_CL_ARG(impl->runtime->block_pool.size->ring_mask)));
  cl(SetKernelArg(impl->kernels.copy, 4,SKC_CL_ARG(impl->reads)));
  cl(SetKernelArg(impl->kernels.copy, 5,SKC_CL_ARG(curr_from.subbuf)));
  cl(SetKernelArg(impl->kernels.copy, 6,SKC_CL_ARG(impl->cmds.buffer)));
  cl(SetKernelArg(impl->kernels.copy, 7,SKC_CL_ARG(impl->blocks.buffer)));
  cl(SetKernelArg(impl->kernels.copy, 8,SKC_CL_ARG(impl->ring.blocks_per.buffer)));
  cl(SetKernelArg(impl->kernels.copy, 9,SKC_CL_ARG(impl->prev.rolling)));
  cl(SetKernelArg(impl->kernels.copy,10,SKC_CL_ARG(impl->prev.from)));
  cl(SetKernelArg(impl->kernels.copy,11,SKC_CL_ARG(pb_prev_span)));
  cl(SetKernelArg(impl->kernels.copy,12,SKC_CL_ARG(impl->curr.from)));

  cl_event complete;

  skc_device_enqueue_kernel(impl->runtime->device,
                            SKC_DEVICE_KERNEL_ID_PATHS_COPY,
                            impl->cq,
                            impl->kernels.copy,
                            pb_cmds,
                            0,NULL,&complete);

  // set a callback on completion
  cl(SetEventCallback(complete,CL_COMPLETE,
                      skc_path_builder_paths_copy_cb,
                      grid));

  // immediately release
  cl(ReleaseEvent(complete));

  //
  // remap as many subbuffers as possible after the kernel completes
  //
  // note that remaps are async and enqueued on the same command queue
  // as the kernel launch
  //
  // we can't remap subbuffers that are in the possibly empty range
  //
  // cases:
  //
  //   - curr.to == wip.to which means no blocks have been acquired
  //   - curr.to points to first block in (next) subbuf
  //   - otherwise, wip acquired blocks in the curr.to subbuf
  //
  // check for these first 2 cases!
  //
  union skc_ringdex_expand const prev_from = skc_ringdex_expand(impl,impl->prev.from);
  skc_uint                 const no_wip    = impl->curr.to == impl->wip.to;
  skc_uint                       map_to    = curr_to.subbuf + (is_partial && no_wip);

  // remap all subbufs in range [from,to)
  skc_path_builder_impl_map(impl,prev_from.subbuf,map_to);

  // flush command queue
  cl(Flush(impl->cq));

  // save rolling
  impl->prev.rolling = impl->wip.rolling.next;

  // update prev and curr
  if (no_wip)
    {
      //
      // if there was no wip then round up to the next subbuf
      //
      skc_ringdex_wip_to_subbuf_inc(impl);

      //
      // update prev/curr with the incremented wip
      //
      impl->prev.from = impl->prev.to = impl->wip.to;
      impl->curr.from = impl->curr.to = impl->wip.to;
    }
  else
    {
      //
      // update prev with wip partials
      //
      impl->prev.from = impl->curr.to;
      impl->prev.to   = impl->wip .to;

      //
      // start curr on a new subbuf boundary
      //
      skc_ringdex_wip_to_subbuf_inc(impl);

      impl->curr.from = impl->wip.to;
      impl->curr.to   = impl->wip.to;
    }
}

//
//
//

static
void
skc_path_builder_impl_acquire_subbuffer(struct skc_path_builder_impl * const impl,
                                        skc_uint                       const subbuf)
{
  //
  // FIXME -- move to a power-of-two subbuf size and kickstart path
  // copies as early as possible
  //
  // FIXME -- the subbufs "self-clock" (flow control) the kernel
  // launches and accounting.  Combine all the subbuffers and release
  // records into a single indexable struct instead of 3.
  //
  struct skc_subbuffer_cmds * const sc        = impl->cmds.subbufs    + subbuf;
  struct skc_release_record * const release   = impl->release.records + subbuf;
  struct skc_scheduler      * const scheduler = impl->runtime->scheduler;

  // can't proceed until the paths have been released
  SKC_SCHEDULER_WAIT_WHILE(scheduler,release->from != release->to);

  // throw in a scheduler yield ... FIXME -- get rid of
  skc_scheduler_yield(scheduler);

  // can't proceed until the subbuffer is mapped
  cl(WaitForEvents(1,&sc->map));
}
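//
// A sketch of the consolidation suggested by the FIXME in
// skc_path_builder_impl_acquire_subbuffer() above: the three parallel
// per-subbuf arrays could collapse into one indexable struct.
// Hypothetical type -- not part of the build.
//
#if 0
struct skc_subbuffer
{
  struct skc_subbuffer_blocks blocks;  // mapped block storage
  struct skc_subbuffer_cmds   cmds;    // mapped command storage + map event
  struct skc_release_record   release; // path release interval for this subbuf
};
#endif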
//
//
//

static
union skc_ringdex_expand
skc_path_builder_impl_acquire_block(struct skc_path_builder_impl * const impl)
{
  // break ringdex into components
  union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to);

  // does the wip ringdex point to a new subbuffer?
  if (to.block == 0)
    {
      // potentially spin/block waiting for subbuffer
      skc_path_builder_impl_acquire_subbuffer(impl,to.subbuf);
    }

  // post-increment wip.to
  skc_ringdex_wip_to_block_inc(impl);

  return to;
}

//
//
//

static
skc_uint
skc_rolling_block(skc_uint const rolling, skc_uint const tag)
{
  return rolling | tag;
}

static
skc_uint
skc_rolling_subblock(skc_uint const rolling, skc_uint const subblock, skc_uint const tag)
{
  return rolling | (subblock << SKC_TAGGED_BLOCK_ID_BITS_TAG) | tag;
}

static
void
skc_rolling_inc(struct skc_path_builder_impl * const impl)
{
  impl->wip.rolling.next += impl->wip.rolling.one;
}

//
//
//

static
void *
skc_path_builder_impl_new_command(struct skc_path_builder_impl * const impl,
                                  skc_uint                       const rolling,
                                  skc_cmd_paths_copy_tag         const tag)
{
  // bump blocks count
  impl->wip.head->header.blocks += 1;

  // acquire a block
  union skc_ringdex_expand const to = skc_path_builder_impl_acquire_block(impl);

  // make a pointer
  union skc_tagged_block_id * const cmds_subbuf = impl->cmds.subbufs[to.subbuf].host;

  // store command for block
  cmds_subbuf[to.block].u32 = skc_rolling_block(rolling,tag);

#if 0
  // store command for block
  cmds_subbuf[to.block].u32 = skc_rolling_block(impl->wip.rolling.next,tag);

  // increment rolling
  skc_rolling_inc(impl);
#endif

  // return pointer to block
  float * const blocks_subbuf = impl->blocks.subbufs[to.subbuf].host;

  // FIXME -- make it easier to get config constant
  return blocks_subbuf + (to.block * impl->runtime->config->block.words);
}

//
//
//

static
void
skc_path_builder_impl_flush_node(struct skc_path_builder_impl * const impl)
{
  // store command to subbuf and get pointer to blocks subbuf
  void * const block = skc_path_builder_impl_new_command(impl,impl->wip.ids.rolling,
                                                         SKC_CMD_PATHS_COPY_TAG_NODE);

  // copy node to blocks subbuf -- write-only
  memcpy(block,impl->wip.node,impl->runtime->config->block.bytes);
}

static
void
skc_path_builder_impl_flush_head(struct skc_path_builder_impl * const impl)
{
  // store command to subbuf and get pointer to blocks subbuf
  void * const block = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next,
                                                         SKC_CMD_PATHS_COPY_TAG_HEAD);

  // copy head to blocks subbuf -- write-only
  memcpy(block,impl->wip.head,impl->runtime->config->block.bytes);

  // increment rolling
  skc_rolling_inc(impl);

  // the 'to' index is non-inclusive so assign wip.to after flush_head
  impl->curr.to = impl->wip.to;
}

//
//
//

static
void
skc_path_builder_impl_new_node_block(struct skc_path_builder_impl * const impl)
{
  // update final block id in node
  impl->wip.ids.next->u32 = skc_rolling_block(impl->wip.rolling.next,SKC_BLOCK_ID_TAG_PATH_NEXT);

  // if wip.ids is not the header then flush the now-full wip node
  if (impl->wip.head->header.nodes > 0)
    skc_path_builder_impl_flush_node(impl);

  // bump node count
  impl->wip.head->header.nodes += 1;

  // save current rolling
  impl->wip.ids.rolling = impl->wip.rolling.next;

  // increment rolling
  skc_rolling_inc(impl);

  // update wip.ids.*
  impl->wip.ids.next = impl->wip.node->tag_ids;
  impl->wip.ids.rem  = impl->runtime->config->block.words;
}

static
void
skc_path_builder_impl_new_segs_block(struct skc_path_builder_impl * const impl)
{
  impl->wip.subblocks.rem     = impl->runtime->config->block.subblocks; // FIXME -- move constants closer to structure
  impl->wip.subblocks.rolling = impl->wip.rolling.next;
  impl->wip.subblocks.next    = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next,
                                                                  SKC_CMD_PATHS_COPY_TAG_SEGS);
  impl->wip.subblocks.idx     = 0;

  // increment rolling
  skc_rolling_inc(impl);
}
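//
// Illustrative sketch (not compiled): how skc_rolling_subblock()
// packs a tagged block id.  The bit width and counts below are
// assumptions for the example only -- the real values are
// SKC_TAGGED_BLOCK_ID_BITS_TAG and the config's subblocks-per-block
// -- and we assume SKC_BLOCK_ID_TAG_COUNT == (1 << bits_tag) so the
// rolling counter's stride leaves the low bits clear.
//
#if 0
static
void
skc_tagged_block_id_example(void)
{
  skc_uint const bits_tag  = 5;                     // assumed tag field width
  skc_uint const subblocks = 8;                     // assumed subblocks per block
  skc_uint const one       = subblocks << bits_tag; // rolling counter stride

  skc_uint const rolling   = 3 * one;               // counter after three increments
  skc_uint const subblock  = 5;                     // subblock index within the block
  skc_uint const tag       = 1;                     // assumed element tag

  // the three fields never collide because the counter advances in
  // strides that leave the subblock and tag bits zero
  skc_uint const id = rolling | (subblock << bits_tag) | tag;

  printf("tagged block id = %08X\n",id);
}
#endif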
//
//
//

static
void
skc_path_builder_impl_acquire_subblocks(struct skc_path_builder_impl * const impl,
                                        skc_block_id_tag                     tag,
                                        skc_uint                             vertices,
                                        float * *                            subblocks)
{
  //
  // THE FIRST TAG RECORDS THE ELEMENT TYPE
  //
  while (true)
    {
      // if only one block id is left in the node then acquire a new
      // node block and append its block id with a 'next' tag
      if (impl->wip.ids.rem == 1)
        skc_path_builder_impl_new_node_block(impl);

      // if zero subblocks are left then acquire a new subblock block
      // and append its block id
      if (impl->wip.subblocks.rem == 0)
        skc_path_builder_impl_new_segs_block(impl);

      // save first command -- tag and subblocks may have been updated
      impl->wip.ids.next->u32 = skc_rolling_subblock(impl->wip.subblocks.rolling,impl->wip.subblocks.idx,tag);

      // increment node block subblock pointer
      impl->wip.ids.next += 1;
      impl->wip.ids.rem  -= 1;

      // how many vertices can we store?
      skc_uint rem = min(vertices,impl->wip.subblocks.rem);

      // decrement vertices
      vertices                -= rem;
      impl->wip.subblocks.rem -= rem;
      impl->wip.subblocks.idx += rem;

      // assign subblocks
      do {
        *subblocks++              = impl->wip.subblocks.next;
        impl->wip.subblocks.next += impl->runtime->config->subblock.words; // FIXME -- move constants closer to structure
      } while (--rem > 0);

      // anything left to do?
      if (vertices == 0)
        break;

      // any tag after this will be a caboose command
      tag = SKC_BLOCK_ID_TAG_PATH_NEXT;
    }
}

//
//
//

static
void
skc_path_builder_pfn_end(struct skc_path_builder_impl * const impl, skc_path_t * const path)
{
  // finalize incomplete active subblocks -- we don't care about any
  // remaining unused subblocks in the block
  skc_path_builder_finalize_subblocks(impl->path_builder);

  // mark remaining wip.ids in the head or node as invalid
  skc_path_builder_impl_finalize_node(impl);

  // flush node if rem > 0 and node is not actually the head
  if (impl->wip.head->header.nodes >= 1)
    skc_path_builder_impl_flush_node(impl);

  // acquire path host id
  *path = skc_runtime_handle_device_acquire(impl->runtime); // FIXME -- MAY WANT TO GRAB AN ID ON BEGIN

  // save path host handle
  impl->wip.head->header.handle = *path;

  // flush head -- acquires a block and bumps head->header.blocks
  skc_path_builder_impl_flush_head(impl);

  // get current release
  struct skc_release_record * const release = skc_release_curr(impl);

  // acquire grid if null
  if (release->grid == NULL)
    {
      release->grid = SKC_GRID_DEPS_ATTACH(impl->runtime->deps,
                                           &release->grid, // NULL on start/force
                                           release,        // data payload
                                           skc_path_builder_grid_pfn_waiting,
                                           NULL,           // no execute pfn
                                           skc_path_builder_grid_pfn_dispose);
    }

  // update grid map
  skc_grid_map(release->grid,*path);

  // update path release
  impl->release.paths[release->to] = *path;

  // increment release.to
  release->to = (release->to + 1) % impl->ring.blocks_per.buffer;

  // add guard bit
  *path |= SKC_TYPED_HANDLE_TYPE_IS_PATH;
#if 1
  //
  // eager kernel launch?
  //
  {
    union skc_ringdex_expand const curr_from = skc_ringdex_expand(impl,impl->curr.from);
    union skc_ringdex_expand const curr_to   = skc_ringdex_expand(impl,impl->curr.to);

    if (curr_from.subbuf != curr_to.subbuf)
      {
        skc_grid_start(release->grid);
        // skc_scheduler_yield(impl->runtime->scheduler);
      }
  }
#endif
}

//
// FIXME -- clean up accessing of CONFIG constants in these 3 routines
//

static
void
skc_path_builder_pfn_new_line(struct skc_path_builder_impl * const impl)
{
  // acquire subblock pointers
  skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_LINE,4,
                                          impl->path_builder->line.coords);

  // increment line count
  impl->wip.head->header.prims += 1;

  // update rem_count_xxx count
  impl->path_builder->line.rem = impl->runtime->config->subblock.words;
}

static
void
skc_path_builder_pfn_new_quad(struct skc_path_builder_impl * const impl)
{
  // acquire subblock pointers
  skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_QUAD,6,
                                          impl->path_builder->quad.coords);

  // increment quad count
  impl->wip.head->header.prims += 1;

  // update rem_count_xxx count
  impl->path_builder->quad.rem = impl->runtime->config->subblock.words;
}

static
void
skc_path_builder_pfn_new_cubic(struct skc_path_builder_impl * const impl)
{
  // acquire subblock pointers
  skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_CUBIC,8,
                                          impl->path_builder->cubic.coords);

  // increment cubic count
  impl->wip.head->header.prims += 1;

  // update rem_count_xxx count
  impl->path_builder->cubic.rem = impl->runtime->config->subblock.words;
}

//
//
//

static
void
skc_path_builder_pfn_release(struct skc_path_builder_impl * const impl)
{
  // decrement reference count
  if (--impl->path_builder->refcount != 0)
    return;

  //
  // otherwise, dispose of everything
  //
  struct skc_runtime * const runtime = impl->runtime;

  // free path builder
  skc_runtime_host_perm_free(impl->runtime,impl->path_builder);

  // release cq
  skc_runtime_release_cq_in_order(runtime,impl->cq);

  // release kernels
  cl(ReleaseKernel(impl->kernels.alloc));
  cl(ReleaseKernel(impl->kernels.copy));

  // free block and command extents
  cl(ReleaseMemObject(impl->blocks.buffer));
  skc_runtime_host_perm_free(runtime,impl->blocks.subbufs);

  cl(ReleaseMemObject(impl->cmds.buffer));
  skc_runtime_host_perm_free(runtime,impl->cmds.subbufs);

  // free records
  skc_runtime_host_perm_free(runtime,impl->release.records);
  skc_runtime_host_perm_free(runtime,impl->release.paths);

  // release staging head and node
  skc_runtime_host_perm_free(runtime,impl->wip.head);
  skc_runtime_host_perm_free(runtime,impl->wip.node);

  // release reads scratch array
  cl(ReleaseMemObject(impl->reads));

  // FIXME:
  //
  // for all subbuffers
  //   unmap   subbuffer
  //   release subbuffer
  //
  // printf("%s not releasing subbuffers\n",__func__);

  skc_runtime_host_perm_free(impl->runtime,impl);
}
//
//
//

skc_err
skc_path_builder_cl_12_create(struct skc_context        * const context,
                              struct skc_path_builder * * const path_builder)
{
  //
  // retain the context
  //
  skc_context_retain(context);

  struct skc_runtime * const runtime = context->runtime;

  // allocate path builder
  (*path_builder) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**path_builder));

  // init state
  SKC_ASSERT_STATE_INIT((*path_builder),SKC_PATH_BUILDER_STATE_READY);

  (*path_builder)->context = context;

  // save opaque impl-specific pointers
  (*path_builder)->begin     = skc_path_builder_pfn_begin;
  (*path_builder)->end       = skc_path_builder_pfn_end;
  (*path_builder)->new_line  = skc_path_builder_pfn_new_line;
  (*path_builder)->new_quad  = skc_path_builder_pfn_new_quad;
  (*path_builder)->new_cubic = skc_path_builder_pfn_new_cubic;
  (*path_builder)->release   = skc_path_builder_pfn_release;

  // initialize path builder counts
  (*path_builder)->line.rem  = 0;
  (*path_builder)->quad.rem  = 0;
  (*path_builder)->cubic.rem = 0;

  (*path_builder)->refcount  = 1;

  struct skc_path_builder_impl * const impl =
    skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl));

  (*path_builder)->impl = impl;

  //
  // init impl
  //
  impl->path_builder  = *path_builder;
  impl->runtime       = runtime;

  impl->cq            = skc_runtime_acquire_cq_in_order(runtime);

  impl->kernels.alloc = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_ALLOC);
  impl->kernels.copy  = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_COPY);

  //
  // FIXME -- let these config constants remain constant and in place
  //
  struct skc_config const * const config = runtime->config;

  impl->ring.subbufs           = config->paths_copy.buffer.count;
  impl->ring.blocks_per.buffer = config->paths_copy.subbuf.count * config->paths_copy.buffer.count;
  impl->ring.blocks_per.subbuf = config->paths_copy.subbuf.count;
  //
  // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  //

  cl_int cl_err;

  // allocate large device-side extent for path data
  impl->blocks.buffer = clCreateBuffer(runtime->cl.context,
                                       CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                                       config->paths_copy.block.buffer, // FIXME -- either use config or local constants everywhere
                                       NULL,&cl_err); cl_ok(cl_err);

  // allocate small host-side array of pointers to mapped subbufs
  impl->blocks.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
                                                     impl->ring.subbufs *
                                                     sizeof(*impl->blocks.subbufs));

  // allocate large device-side extent for path copy commands
  impl->cmds.buffer = clCreateBuffer(runtime->cl.context,
                                     CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                                     config->paths_copy.command.buffer,
                                     NULL,&cl_err); cl_ok(cl_err);

  // allocate small host-side array of pointers to mapped subbufs
  impl->cmds.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
                                                   impl->ring.subbufs *
                                                   sizeof(*impl->cmds.subbufs));

  // allocate small host-side array of intervals of path handles
  impl->release.records = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
                                                      impl->ring.subbufs *
                                                      sizeof(*impl->release.records));

  // allocate large host-side array that is max # of path handles in flight
  impl->release.paths = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
                                                    impl->ring.blocks_per.buffer *
                                                    sizeof(*impl->release.paths));

  // small scratch used by kernels
  impl->reads = clCreateBuffer(runtime->cl.context,
                               CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS,
                               sizeof(skc_uint) * impl->ring.subbufs,
                               NULL,&cl_err); cl_ok(cl_err);

  // initialize release records with impl backpointer
  for (skc_uint ii=0; ii<impl->ring.subbufs; ii++)
    {
      struct skc_release_record * record = impl->release.records + ii;

      record->impl = impl;
      record->grid = NULL;
      record->from = record->to = ii * impl->ring.blocks_per.subbuf;
    }

  //
  // allocate and map subbuffers -- we always check the command
  // subbuffer's map/unmap events before touching it or its associated
  // block subbuffer.
  //
  struct skc_subbuffer_blocks * sb = impl->blocks.subbufs;
  struct skc_subbuffer_cmds   * sc = impl->cmds  .subbufs;

  cl_buffer_region rb = { 0, config->paths_copy.block.subbuf   };
  cl_buffer_region rc = { 0, config->paths_copy.command.subbuf };

  // for each subbuffer
  for (skc_uint ii=0; ii<config->paths_copy.buffer.count; ii++)
    {
      sb->device = clCreateSubBuffer(impl->blocks.buffer,
                                     CL_MEM_HOST_WRITE_ONLY,
                                     CL_BUFFER_CREATE_TYPE_REGION,
                                     &rb,
                                     &cl_err); cl_ok(cl_err);

      sb->host   = clEnqueueMapBuffer(impl->cq,
                                      sb->device,
                                      CL_FALSE,
                                      CL_MAP_WRITE_INVALIDATE_REGION,
                                      0,rb.size,
                                      0,NULL,NULL,
                                      &cl_err); cl_ok(cl_err);

      sc->device = clCreateSubBuffer(impl->cmds.buffer,
                                     CL_MEM_HOST_WRITE_ONLY,
                                     CL_BUFFER_CREATE_TYPE_REGION,
                                     &rc,
                                     &cl_err); cl_ok(cl_err);

      sc->host   = clEnqueueMapBuffer(impl->cq,
                                      sc->device,
                                      CL_FALSE,
                                      CL_MAP_WRITE_INVALIDATE_REGION,
                                      0,rc.size,
                                      0,NULL,&sc->map,
                                      &cl_err); cl_ok(cl_err);

      sb        += 1;
      sc        += 1;

      rb.origin += rb.size;
      rc.origin += rc.size;
    }

  //
  // initialize remaining members
  //
  impl->prev.from        = 0;
  impl->prev.to          = 0;
  impl->prev.rolling     = 0;

  impl->curr.from        = 0;
  impl->curr.to          = 0;

  impl->wip.to           = 0;

  impl->wip.head         = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes);
  impl->wip.node         = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes);

  impl->wip.rolling.one  = SKC_BLOCK_ID_TAG_COUNT * config->block.subblocks;
  impl->wip.rolling.next = 0;

  // for now, completely initialize the builder before returning
  cl(Finish(impl->cq));

  return SKC_ERR_SUCCESS;
}

//
//
//
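//
// A minimal usage sketch (not compiled): creating a path builder via
// this entry point and disposing of it through its release pfn.  The
// 'context' argument is assumed to be a fully initialized skc_context
// and the teardown call shown is an assumption based on the pfn
// wiring in create() above.
//
#if 0
static
void
skc_path_builder_cl_12_example(struct skc_context * const context)
{
  struct skc_path_builder * pb;

  if (skc_path_builder_cl_12_create(context,&pb) == SKC_ERR_SUCCESS)
    {
      // ... build paths with pb->begin / pb->new_line / pb->end ...

      // drop the sole reference -- frees the impl and its extents
      pb->release(pb->impl);
    }
}
#endif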