Diffstat (limited to 'src/compute/skc/platforms/cl_12/path_builder_cl_12.c')
-rw-r--r--  src/compute/skc/platforms/cl_12/path_builder_cl_12.c  1443
1 file changed, 1443 insertions, 0 deletions
diff --git a/src/compute/skc/platforms/cl_12/path_builder_cl_12.c b/src/compute/skc/platforms/cl_12/path_builder_cl_12.c
new file mode 100644
index 0000000000..e915dffada
--- /dev/null
+++ b/src/compute/skc/platforms/cl_12/path_builder_cl_12.c
@@ -0,0 +1,1443 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <float.h>
+#include <stdio.h>
+
+#include "common/cl/assert_cl.h"
+
+#include "context.h"
+#include "handle.h"
+#include "grid.h"
+#include "path.h"
+#include "path_builder.h"
+
+#include "config_cl.h"
+#include "export_cl_12.h"
+#include "runtime_cl_12.h"
+#include "path_builder_cl_12.h"
+
+//
+// OpenCL 1.2 devices support mapping of buffers into the host address
+// space.
+//
+// Mapped buffers must be aligned on a MIN_DATA_TYPE_ALIGN_SIZE byte
+// boundary (e.g. 128 bytes). This complicates coordinating sharing
+// of data between the host and the device.
+//
+// Some OpenCL 2.0 devices support fine-grained shared virtual memory
+// pointers with byte-addressing and allow simpler coordination
+// strategies at the cost of maintaining cache coherency.
+//
+// The path builder is focused on moving bulk path data from the host
+// into the device-managed "block" memory pool and arranging it into a
+// SIMT/SIMD-friendly data structure that can be efficiently read by
+// the rasterizer.
+//
+// Note that one simplifying assumption is that the maximum length of
+// a *single* path can't be larger than what fits in the single extent
+// (which is split into M subbuffers). This would be a very long path
+// and a legitimate size limitation.
+//
+// For some systems, it may be appropriate to never pull path data
+// into the device-managed block pool and instead present the path
+// data to the device in a temporarily available allocated memory
+// "zone" of paths that can be discarded all at once.
+//
+// For other systems, it may be appropriate to simply copy the path
+// data from host to device.
+//
+// But the majority of OpenCL (and VK, MTL, DX12) devices we'll be
+// targeting support basic map/unmap functionality similar to OpenCL
+// 1.2. Furthermore, not all OpenCL 2.0 devices support fine-grained
+// sharing of memory and still require a map/unmap step... but note
+// that they all support byte-aligned mapping and subbuffers.
+//
+// The general strategy that this particular CL_12 implementation uses
+// is to allocate a large mappable bulk-data path buffer and an
+// auxiliary mappable command buffer.
+//
+// The buffers are split into a reasonable number of properly aligned
+// subbuffers to enable simultaneous host and device access.
+//
+
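+//
+// Illustrative sizing (hypothetical numbers -- the real sizes come
+// from the runtime config): a 16 MB block buffer split into M=4
+// aligned 4 MB subbuffers lets the host fill subbuffer k+1 while the
+// device is still copying out of subbuffer k.
+//
+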
+//
+// Blocks:
+// 1 extent
+// M mapped subbuffers (configurable) to allow for concurrency
+//
+// Commands:
+// 1 extent
+// M mapped subbuffers (configurable) to allow for concurrency
+//
+// Spans:
+// M hi/lo structures
+//
+// { cl_sub, void*, event, base }
+//
+// - size of sub buffer
+// - remaining
+//
+// - counts
+//
+
+//
+// For any kernel launch, at most one path will be discontiguous and
+// defined across two sub-buffers.
+//
+// Nodes are updated locally until full and then stored so they will
+// never be incomplete. Headers are stored locally until the path is
+// ended so they will never be incomplete.
+//
+// A line, quad or cubic acquires 4/6/8 segments which may be spread
+// across one or more contiguous blocks.
+//
+// If a flush() occurs then the remaining columns of multi-segment
+// paths are initialized with zero-length line, quad, cubic elements.
+//
+// Every block's command word has a type and a count acquired from a
+// rolling counter.
+//
+// The kernel is passed two spans of blocks { base, count } to
+// process. The grid must process (lo.count + hi.count) blocks.
+//
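+// Example (hypothetical counts): if the previous flush left a
+// partial span of 2 blocks and the current flush spans 24 blocks,
+// the copy kernel is launched to process (2 + 24) = 26 commands and
+// their associated blocks.
+//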
+
+struct skc_subbuffer_blocks
+{
+ cl_mem device;
+ void * host;
+};
+
+struct skc_subbuffer_cmds
+{
+ cl_mem device;
+ void * host;
+ cl_event map;
+};
+
+//
+// ringdex is an index with range [0, blocks-per-subbuf * subbufs-per-buffer )
+//
+
+typedef skc_uint skc_ringdex_t;
+
+union skc_ringdex_expand
+{
+ div_t qr;
+
+ struct {
+#ifndef SKC_DIV_REM_BEFORE_QUOT // offsetof(div_t,quot) != 0
+ skc_uint subbuf;
+ skc_uint block;
+#else
+ skc_uint block;
+ skc_uint subbuf;
+#endif
+ };
+};
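+
+//
+// Worked example (hypothetical config): with blocks_per.subbuf == 8,
+// a ringdex of 21 expands via div(21,8) to { .subbuf = 2, .block = 5 },
+// i.e. the 6th block of the 3rd subbuffer.
+//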
+
+//
+// this record is executed by the grid
+//
+
+struct skc_release_record
+{
+ struct skc_path_builder_impl * impl; // back pointer to impl
+
+ skc_grid_t grid; // pointer to scheduled grid
+
+ skc_uint from; // inclusive starting index : [from,to)
+ skc_uint to; // non-inclusive ending index : [from,to)
+};
+
+//
+//
+//
+
+struct skc_path_builder_impl
+{
+ struct skc_path_builder * path_builder;
+
+ struct skc_runtime * runtime;
+
+ cl_command_queue cq;
+
+ struct {
+ cl_kernel alloc;
+ cl_kernel copy;
+ } kernels;
+
+ //
+ // FIXME -- make this pointer to constant config
+ //
+ // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
+ struct {
+ skc_uint subbufs; // how many subbufs in the buffer?
+
+ struct {
+ skc_uint buffer; // how many blocks in the buffer?
+ skc_uint subbuf; // how many blocks in a subbuf?
+ } blocks_per;
+ } ring;
+ //
+ // ^^^^^^^^^^^ don't duplicate these constants ^^^^^^^^^^^^^^^^^^
+ //
+
+ struct {
+ cl_mem buffer; // backing buffer for blocks
+ struct skc_subbuffer_blocks * subbufs; // array of structures
+ } blocks;
+
+ struct {
+ cl_mem buffer; // backing buffer for commands
+ struct skc_subbuffer_cmds * subbufs; // array of structures
+ } cmds;
+
+ struct {
+ struct skc_release_record * records; // max release records is equal to max subbufs
+ skc_path_t * paths; // max paths is less than or equal to max commands
+ } release;
+
+ cl_mem reads; // each kernel only requires one word to store the block pool "base"
+
+ struct {
+ skc_uint rolling; // rolling counter used by cmds to map to block pool alloc
+ skc_ringdex_t from;
+ skc_ringdex_t to;
+ } prev;
+
+ struct {
+ skc_ringdex_t from;
+ skc_ringdex_t to;
+ } curr;
+
+ struct {
+ struct skc_path_head * head; // pointer to local path header -- not written until path end
+ struct skc_path_node * node; // pointer to local node -- may alias head until head is full
+
+ struct {
+ skc_uint rolling; // rolling counter of wip node -- valid after one node is allocated
+ union skc_tagged_block_id * next; // next slot in node -- may initially point to head.ids
+ skc_uint rem; // how many id slots left in node block
+ } ids;
+
+ struct {
+ skc_uint rem; // how many subblocks left in block?
+ skc_uint rolling; // rolling counter of block of subblocks
+ float * next; // next subblock in current subblock block
+ skc_uint idx; // index of next subblock
+ } subblocks;
+
+ struct {
+ skc_uint one; // .block = 1
+ skc_uint next; // rolling counter used by cmds to map to block pool alloc
+ } rolling;
+
+ skc_ringdex_t to; // ringdex of _next_available_ command/block in ring -- FIXME -- should be current
+ } wip;
+};
+
+//
+// FIXME -- move to a pow2 subbuffer size and dispense with division
+// and modulo operations
+//
+
+static
+union skc_ringdex_expand
+skc_ringdex_expand(struct skc_path_builder_impl * const impl,
+ skc_ringdex_t const ringdex)
+{
+ return (union skc_ringdex_expand){
+ .qr = div(ringdex,impl->ring.blocks_per.subbuf)
+ };
+}
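+
+//
+// A minimal sketch of the pow2 alternative named in the FIXME above,
+// assuming blocks_per.subbuf were constrained to a power of two and a
+// hypothetical precomputed ring.blocks_per.subbuf_log2 were added --
+// the div/mod pair collapses to a shift and a mask:
+//
+//   return (union skc_ringdex_expand){
+//     .subbuf = ringdex >> impl->ring.blocks_per.subbuf_log2,
+//     .block  = ringdex &  (impl->ring.blocks_per.subbuf - 1)
+//   };
+//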
+
+static
+void
+skc_ringdex_wip_to_block_inc(struct skc_path_builder_impl * const impl)
+{
+ //
+ // FIXME - which is faster?
+ //
+#if 1
+ impl->wip.to = (impl->wip.to + 1) % impl->ring.blocks_per.buffer;
+#else
+ impl->wip.to -= (impl->wip.to < impl->ring.blocks_per.buffer - 1) ? -1 : impl->wip.to;
+#endif
+
+ // this path is too long -- for now assert() and die
+ assert(impl->wip.to != impl->curr.from);
+}
+
+static
+skc_ringdex_t
+skc_ringdex_span(struct skc_path_builder_impl * const impl,
+ skc_ringdex_t const from,
+ skc_ringdex_t const to)
+{
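+ // note: unsigned arithmetic -- if 'to' has wrapped past 'from' the
+ // subtraction still yields the correct modular span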
+ return (to - from) % impl->ring.blocks_per.buffer;
+}
+
+static
+void
+skc_ringdex_wip_to_subbuf_inc(struct skc_path_builder_impl * const impl)
+{
+ union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to);
+
+ // nothing to do if this is the first block in the subbuf
+ if (to.block == 0)
+ return;
+
+ skc_uint const new_subbuf = (to.subbuf + 1) % impl->ring.subbufs;
+
+ // otherwise increment and mod
+ impl->wip.to = new_subbuf * impl->ring.blocks_per.subbuf;
+}
+
+static
+skc_bool
+skc_ringdex_curr_is_equal(struct skc_path_builder_impl * const impl)
+{
+ return impl->curr.from == impl->curr.to;
+}
+
+static
+skc_bool
+skc_ringdex_prev_is_equal(struct skc_path_builder_impl * const impl)
+{
+ return impl->prev.from == impl->prev.to;
+}
+
+static
+skc_uint
+skc_ringdex_dont_map_last(struct skc_path_builder_impl * const impl,
+ skc_uint const to_block)
+{
+ // no blocks acquired OR this is last block in subbuf
+ return !((impl->wip.to == impl->curr.to) || (to_block == 0));
+}
+
+//
+//
+//
+
+static
+struct skc_release_record *
+skc_release_curr(struct skc_path_builder_impl * const impl)
+{
+ union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from);
+
+ return impl->release.records + curr_from.subbuf;
+}
+
+//
+// FIXME -- get rid of all distant config references -- grab them all at creation time
+//
+
+static
+void
+skc_path_builder_pfn_begin(struct skc_path_builder_impl * const impl)
+{
+ // init header counters // { handle, blocks, nodes, prims }
+ impl->wip.head->header = (union skc_path_header){
+ .handle = 0,
+ .blocks = 0,
+ .nodes = 0,
+ .prims = 0
+ };
+
+ // FIXME -- BOUNDS SHOULD USE SIMD4 TRICK AND NEGATE ONE OF THE CORNERS
+ impl->wip.head->bounds = (union skc_path_bounds){ +FLT_MIN, +FLT_MIN, -FLT_MIN, -FLT_MIN };
+
+ // point wip ids at local head node
+ impl->wip.ids.next = impl->wip.head->tag_ids; // point to local head node
+ impl->wip.ids.rem = impl->runtime->config->block.words - SKC_PATH_HEAD_WORDS; // FIXME -- save this constant somewhere
+
+ // start with no subblocks
+ impl->wip.subblocks.rem = 0;
+}
+
+//
+//
+//
+
+static
+void
+skc_path_builder_impl_finalize_node(struct skc_path_builder_impl * const impl)
+{
+#if 1
+ //
+ // FIXME -- a Duff's device might be optimal here but would have to
+ // be customized per device since nodes could be 16-128+ words
+ //
+ while (impl->wip.ids.rem > 0)
+ {
+ impl->wip.ids.rem -= 1;
+ impl->wip.ids.next->u32 = SKC_TAGGED_BLOCK_ID_INVALID;
+ impl->wip.ids.next += 1;
+ }
+#else
+ memset(&impl->wip.ids.next->u32,
+ SKC_TAGGED_BLOCK_ID_INVALID, // 0xFF
+ sizeof(impl->wip.ids.next->u32) * impl->wip.ids.rem);
+
+ impl->wip.ids.next += impl->wip.ids.rem;
+ impl->wip.ids.rem = 0;
+#endif
+}
+
+//
+//
+//
+
+static
+void
+skc_zero_float(skc_float * p, skc_uint rem)
+{
+ memset(p,0,sizeof(*p)*rem);
+}
+
+static
+void
+skc_path_builder_finalize_subblocks(struct skc_path_builder * const path_builder)
+{
+ //
+ // FIXME -- it might be more performant to zero the remaining
+ // columns in a subblock -- a subblock at a time -- instead of the
+ // same column across all the subblocks
+ //
+#if 0
+ while (path_builder->line.rem > 0)
+ {
+ --path_builder->line.rem;
+
+ *path_builder->line.coords[0]++ = 0.0f;
+ *path_builder->line.coords[1]++ = 0.0f;
+ *path_builder->line.coords[2]++ = 0.0f;
+ *path_builder->line.coords[3]++ = 0.0f;
+ }
+
+ while (path_builder->quad.rem > 0)
+ {
+ --path_builder->quad.rem;
+
+ *path_builder->quad.coords[0]++ = 0.0f;
+ *path_builder->quad.coords[1]++ = 0.0f;
+ *path_builder->quad.coords[2]++ = 0.0f;
+ *path_builder->quad.coords[3]++ = 0.0f;
+ *path_builder->quad.coords[4]++ = 0.0f;
+ *path_builder->quad.coords[5]++ = 0.0f;
+ }
+
+ while (path_builder->cubic.rem > 0)
+ {
+ --path_builder->cubic.rem;
+
+ *path_builder->cubic.coords[0]++ = 0.0f;
+ *path_builder->cubic.coords[1]++ = 0.0f;
+ *path_builder->cubic.coords[2]++ = 0.0f;
+ *path_builder->cubic.coords[3]++ = 0.0f;
+ *path_builder->cubic.coords[4]++ = 0.0f;
+ *path_builder->cubic.coords[5]++ = 0.0f;
+ *path_builder->cubic.coords[6]++ = 0.0f;
+ *path_builder->cubic.coords[7]++ = 0.0f;
+ }
+#else
+ if (path_builder->line.rem > 0)
+ {
+ skc_zero_float(path_builder->line.coords[0],path_builder->line.rem);
+ skc_zero_float(path_builder->line.coords[1],path_builder->line.rem);
+ skc_zero_float(path_builder->line.coords[2],path_builder->line.rem);
+ skc_zero_float(path_builder->line.coords[3],path_builder->line.rem);
+
+ path_builder->line.rem = 0;
+ }
+
+ if (path_builder->quad.rem > 0)
+ {
+ skc_zero_float(path_builder->quad.coords[0],path_builder->quad.rem);
+ skc_zero_float(path_builder->quad.coords[1],path_builder->quad.rem);
+ skc_zero_float(path_builder->quad.coords[2],path_builder->quad.rem);
+ skc_zero_float(path_builder->quad.coords[3],path_builder->quad.rem);
+ skc_zero_float(path_builder->quad.coords[4],path_builder->quad.rem);
+ skc_zero_float(path_builder->quad.coords[5],path_builder->quad.rem);
+
+ path_builder->quad.rem = 0;
+ }
+
+ if (path_builder->cubic.rem > 0)
+ {
+ skc_zero_float(path_builder->cubic.coords[0],path_builder->cubic.rem);
+ skc_zero_float(path_builder->cubic.coords[1],path_builder->cubic.rem);
+ skc_zero_float(path_builder->cubic.coords[2],path_builder->cubic.rem);
+ skc_zero_float(path_builder->cubic.coords[3],path_builder->cubic.rem);
+ skc_zero_float(path_builder->cubic.coords[4],path_builder->cubic.rem);
+ skc_zero_float(path_builder->cubic.coords[5],path_builder->cubic.rem);
+ skc_zero_float(path_builder->cubic.coords[6],path_builder->cubic.rem);
+ skc_zero_float(path_builder->cubic.coords[7],path_builder->cubic.rem);
+
+ path_builder->cubic.rem = 0;
+ }
+#endif
+}
+
+//
+//
+//
+
+static
+void
+skc_path_builder_impl_unmap(struct skc_path_builder_impl * const impl,
+ skc_uint from,
+ skc_uint to)
+{
+ // to might be out of range
+ to = to % impl->ring.subbufs;
+
+#if 0
+ fprintf(stderr,"unmap: [%2u,%2u)\n",from,to);
+#endif
+
+ while (from != to)
+ {
+ // bring 'from' back in range
+ from = from % impl->ring.subbufs;
+
+ struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from;
+ struct skc_subbuffer_cmds * const cmds = impl->cmds .subbufs + from;
+
+ cl(EnqueueUnmapMemObject(impl->cq,
+ blocks->device,
+ blocks->host,
+ 0,NULL,NULL));
+
+ cl(EnqueueUnmapMemObject(impl->cq,
+ cmds->device,
+ cmds->host,
+ 0,NULL,NULL));
+
+ // increment and bring 'from' back in range
+ from = (from + 1) % impl->ring.subbufs;
+ }
+}
+
+//
+// FIXME -- reuse this in create()
+//
+
+static
+void
+skc_path_builder_impl_map(struct skc_path_builder_impl * const impl,
+ skc_uint from,
+ skc_uint to)
+{
+ // to might be out of range
+ to = to % impl->ring.subbufs;
+
+#if 0
+ fprintf(stderr," map: [%2u,%2u)\n",from,to);
+#endif
+
+ while (from != to)
+ {
+ cl_int cl_err;
+
+ struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from;
+ struct skc_subbuffer_cmds * const cmds = impl->cmds .subbufs + from;
+
+ blocks->host = clEnqueueMapBuffer(impl->cq,
+ blocks->device,
+ CL_FALSE,
+ CL_MAP_WRITE_INVALIDATE_REGION,
+ 0,impl->runtime->config->paths_copy.block.subbuf,
+ 0,NULL,NULL,
+ &cl_err); cl_ok(cl_err);
+
+ cl(ReleaseEvent(cmds->map));
+
+ cmds->host = clEnqueueMapBuffer(impl->cq,
+ cmds->device,
+ CL_FALSE,
+ CL_MAP_WRITE_INVALIDATE_REGION,
+ 0,impl->runtime->config->paths_copy.command.subbuf,
+ 0,NULL,&cmds->map,
+ &cl_err); cl_ok(cl_err);
+
+ // increment and bring 'from' back in range
+ from = (from + 1) % impl->ring.subbufs;
+ }
+ //
+ // FIXME -- when we switch to out of order queues we'll need a barrier here
+ //
+}
+
+//
+//
+//
+
+static
+void
+skc_path_builder_release_dispose(struct skc_release_record * const release,
+ struct skc_path_builder_impl * const impl)
+{
+ struct skc_runtime * runtime = impl->runtime;
+
+ if (release->from <= release->to) // no wrap
+ {
+ skc_path_t const * paths = impl->release.paths + release->from;
+ skc_uint count = release->to - release->from;
+
+ skc_grid_deps_unmap(runtime->deps,paths,count);
+ skc_runtime_path_device_release(runtime,paths,count);
+ }
+ else // from > to implies wrap
+ {
+ skc_path_t const * paths_lo = impl->release.paths + release->from;
+ skc_uint count_lo = impl->ring.blocks_per.buffer - release->from;
+
+ skc_grid_deps_unmap(runtime->deps,paths_lo,count_lo);
+ skc_runtime_path_device_release(runtime,paths_lo,count_lo);
+
+ skc_grid_deps_unmap(runtime->deps,impl->release.paths,release->to);
+ skc_runtime_path_device_release(runtime,impl->release.paths,release->to);
+ }
+
+ release->to = release->from;
+}
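+
+//
+// Example of the wrap case above (hypothetical indices): with
+// blocks_per.buffer == 32, from == 30 and to == 3, the paths in
+// [30,32) are released first, followed by the paths in [0,3).
+//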
+
+static
+void
+skc_path_builder_grid_pfn_dispose(skc_grid_t const grid)
+{
+ struct skc_release_record * const release = skc_grid_get_data(grid);
+ struct skc_path_builder_impl * const impl = release->impl;
+
+ skc_path_builder_release_dispose(release,impl);
+}
+
+static
+void
+// skc_path_builder_complete(struct skc_release_record * const release)
+skc_path_builder_complete(skc_grid_t grid)
+{
+ //
+ // notify deps that this grid is complete enough for other grids to
+ // proceed
+ //
+ // the path builder still has some cleanup to do before all its
+ // resources can be reused
+ //
+ skc_grid_complete(grid);
+}
+
+static
+void
+skc_path_builder_paths_copy_cb(cl_event event, cl_int status, skc_grid_t grid)
+{
+ SKC_CL_CB(status);
+
+ struct skc_release_record * const release = skc_grid_get_data(grid);
+
+ SKC_SCHEDULER_SCHEDULE(release->impl->runtime->scheduler,skc_path_builder_complete,grid);
+}
+
+//
+//
+//
+
+static
+void
+skc_path_builder_grid_pfn_waiting(skc_grid_t const grid)
+{
+ struct skc_release_record * const release = skc_grid_get_data(grid);
+ struct skc_path_builder_impl * const impl = release->impl;
+
+ // 1. flush incomplete subblocks of path elements
+ // 2. unmap subbuffer on cq.unmap
+ // 3. flush cq.unmap
+ // 4. launch kernel on cq.kernel but wait for unmap completion
+ // 5. flush cq.kernel
+ // 6. remap relevant subbuffers on cq.map but wait for kernel completion
+ // 7. flush cq.map
+
+ //
+ // FIXME -- can be smarter about flushing if the wip paths are not
+ // in the same subbuf as curr.to
+ //
+ // THIS IS IMPORTANT TO FIX
+ //
+
+ // flush incomplete subblocks
+ skc_path_builder_finalize_subblocks(impl->path_builder);
+
+ //
+ // get range of subbufs that need to be unmapped
+ //
+ // note that impl->prev subbufs have already been unmapped
+ //
+ union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from);
+ union skc_ringdex_expand curr_to = skc_ringdex_expand(impl,impl->curr.to);
+ skc_uint const is_partial = curr_to.block > 0;
+ skc_uint const unmap_to = curr_to.subbuf + is_partial;
+
+ //
+ // unmap all subbufs in range [from,to)
+ //
+ skc_path_builder_impl_unmap(impl,curr_from.subbuf,unmap_to);
+
+ //
+ // launch kernels
+ //
+ skc_uint const pb_prev_span = skc_ringdex_span(impl,impl->prev.from,impl->prev.to);
+ skc_uint const pb_curr_span = skc_ringdex_span(impl,impl->curr.from,impl->curr.to);
+ skc_uint const pb_cmds = pb_prev_span + pb_curr_span;
+
+ //
+ // 1) allocate blocks from pool
+ //
+
+ //
+ // FIXME -- pack integers into struct/vector
+ //
+ cl(SetKernelArg(impl->kernels.alloc,0,SKC_CL_ARG(impl->runtime->block_pool.atomics.drw)));
+ cl(SetKernelArg(impl->kernels.alloc,1,SKC_CL_ARG(impl->reads)));
+ cl(SetKernelArg(impl->kernels.alloc,2,SKC_CL_ARG(curr_from.subbuf)));
+ cl(SetKernelArg(impl->kernels.alloc,3,SKC_CL_ARG(pb_cmds)));
+
+ skc_device_enqueue_kernel(impl->runtime->device,
+ SKC_DEVICE_KERNEL_ID_PATHS_ALLOC,
+ impl->cq,
+ impl->kernels.alloc,
+ 1,
+ 0,NULL,NULL);
+
+ //
+ // 2) copy blocks from unmapped device-accessible memory
+ //
+
+ //
+ // FIXME -- pack integers into struct/vector and reduce 13 arguments down to 7
+ //
+ cl(SetKernelArg(impl->kernels.copy, 0,SKC_CL_ARG(impl->runtime->handle_pool.map.drw)));
+
+ cl(SetKernelArg(impl->kernels.copy, 1,SKC_CL_ARG(impl->runtime->block_pool.ids.drw)));
+ cl(SetKernelArg(impl->kernels.copy, 2,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw)));
+ cl(SetKernelArg(impl->kernels.copy, 3,SKC_CL_ARG(impl->runtime->block_pool.size->ring_mask)));
+
+ cl(SetKernelArg(impl->kernels.copy, 4,SKC_CL_ARG(impl->reads)));
+ cl(SetKernelArg(impl->kernels.copy, 5,SKC_CL_ARG(curr_from.subbuf)));
+
+ cl(SetKernelArg(impl->kernels.copy, 6,SKC_CL_ARG(impl->cmds.buffer)));
+ cl(SetKernelArg(impl->kernels.copy, 7,SKC_CL_ARG(impl->blocks.buffer)));
+
+ cl(SetKernelArg(impl->kernels.copy, 8,SKC_CL_ARG(impl->ring.blocks_per.buffer)));
+ cl(SetKernelArg(impl->kernels.copy, 9,SKC_CL_ARG(impl->prev.rolling)));
+
+ cl(SetKernelArg(impl->kernels.copy,10,SKC_CL_ARG(impl->prev.from)));
+ cl(SetKernelArg(impl->kernels.copy,11,SKC_CL_ARG(pb_prev_span)));
+ cl(SetKernelArg(impl->kernels.copy,12,SKC_CL_ARG(impl->curr.from)));
+
+ cl_event complete;
+
+ skc_device_enqueue_kernel(impl->runtime->device,
+ SKC_DEVICE_KERNEL_ID_PATHS_COPY,
+ impl->cq,
+ impl->kernels.copy,
+ pb_cmds,
+ 0,NULL,&complete);
+
+ // set a callback on completion
+ cl(SetEventCallback(complete,CL_COMPLETE,
+ skc_path_builder_paths_copy_cb,
+ grid));
+
+ // immediately release
+ cl(ReleaseEvent(complete));
+
+ //
+ // remap as many subbuffers as possible after the kernel completes
+ //
+ // note that remaps are async and enqueued on the same command queue
+ // as the kernel launch
+ //
+ // we can't remap subbuffers that are in the possibly empty range
+ //
+ // cases:
+ //
+ // - curr.to == wip.to which means no blocks have been acquired
+ // - curr.to points to first block in (next) subbuf
+ // - otherwise, wip acquired blocks in the curr.to subbuf
+ //
+ // check for these first 2 cases!
+ //
+ union skc_ringdex_expand const prev_from = skc_ringdex_expand(impl,impl->prev.from);
+ skc_uint const no_wip = impl->curr.to == impl->wip.to;
+ skc_uint map_to = curr_to.subbuf + (is_partial && no_wip);
+
+ // remap all subbufs in range [from,to)
+ skc_path_builder_impl_map(impl,prev_from.subbuf,map_to);
+
+ // flush command queue
+ cl(Flush(impl->cq));
+
+ // save rolling
+ impl->prev.rolling = impl->wip.rolling.next;
+
+ // update prev and curr
+ if (no_wip)
+ {
+ //
+ // if there was no wip then round up to the next subbuf
+ //
+ skc_ringdex_wip_to_subbuf_inc(impl);
+
+ //
+ // update prev/curr with the incremented wip
+ //
+ impl->prev.from = impl->prev.to = impl->wip.to;
+ impl->curr.from = impl->curr.to = impl->wip.to;
+ }
+ else
+ {
+ //
+ // update prev with wip partials
+ //
+ impl->prev.from = impl->curr.to;
+ impl->prev.to = impl->wip .to;
+
+ //
+ // start curr on a new subbuf boundary
+ //
+ skc_ringdex_wip_to_subbuf_inc(impl);
+
+ impl->curr.from = impl->wip.to;
+ impl->curr.to = impl->wip.to;
+ }
+}
+
+//
+//
+//
+
+static
+void
+skc_path_builder_impl_acquire_subbuffer(struct skc_path_builder_impl * const impl,
+ skc_uint const subbuf)
+{
+ //
+ // FIXME -- move to a power-of-two subbuf size and kickstart path
+ // copies as early as possible
+ //
+ // FIXME -- the subbufs "self-clock" (flow control) the kernel
+ // launches and accounting. Combine all the subbuffers and release
+ // records into a single indexable struct instead of 3.
+ //
+ struct skc_subbuffer_cmds * const sc = impl->cmds.subbufs + subbuf;
+ struct skc_release_record * const release = impl->release.records + subbuf;
+ struct skc_scheduler * const scheduler = impl->runtime->scheduler;
+
+ // can't proceed until the paths have been released
+ SKC_SCHEDULER_WAIT_WHILE(scheduler,release->from != release->to);
+
+ // throw in a scheduler yield ... FIXME -- get rid of
+ skc_scheduler_yield(scheduler);
+
+ // can't proceed until the subbuffer is mapped
+ cl(WaitForEvents(1,&sc->map));
+}
+
+//
+//
+//
+
+static
+union skc_ringdex_expand
+skc_path_builder_impl_acquire_block(struct skc_path_builder_impl * const impl)
+{
+ // break ringdex into components
+ union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to);
+
+ // does wip ringdex point to a new subbuffer?
+ if (to.block == 0)
+ {
+ // potentially spin/block waiting for subbuffer
+ skc_path_builder_impl_acquire_subbuffer(impl,to.subbuf);
+ }
+
+ // post increment wip.to
+ skc_ringdex_wip_to_block_inc(impl);
+
+ return to;
+}
+
+//
+//
+//
+
+static
+skc_uint
+skc_rolling_block(skc_uint const rolling, skc_uint const tag)
+{
+ return rolling | tag;
+}
+
+static
+skc_uint
+skc_rolling_subblock(skc_uint const rolling, skc_uint const subblock, skc_uint const tag)
+{
+ return rolling | (subblock << SKC_TAGGED_BLOCK_ID_BITS_TAG) | tag;
+}
+
+static
+void
+skc_rolling_inc(struct skc_path_builder_impl * const impl)
+{
+ impl->wip.rolling.next += impl->wip.rolling.one;
+}
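+
+//
+// Layout sketch (as implied by the helpers above): the low
+// SKC_TAGGED_BLOCK_ID_BITS_TAG bits of a command word hold the tag
+// and the upper bits hold the rolling block id, optionally offset by
+// a subblock index. The rolling counter advances by wip.rolling.one
+// -- subblocks-per-block times the tag count -- so successive blocks
+// never produce colliding tagged ids.
+//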
+
+//
+//
+//
+
+static
+void *
+skc_path_builder_impl_new_command(struct skc_path_builder_impl * const impl,
+ skc_uint const rolling,
+ skc_cmd_paths_copy_tag const tag)
+{
+ // bump blocks count
+ impl->wip.head->header.blocks += 1;
+
+ // acquire a block
+ union skc_ringdex_expand const to = skc_path_builder_impl_acquire_block(impl);
+
+ // make a pointer
+ union skc_tagged_block_id * const cmds_subbuf = impl->cmds.subbufs[to.subbuf].host;
+
+ // store command for block
+ cmds_subbuf[to.block].u32 = skc_rolling_block(rolling,tag);
+
+#if 0
+ // store command for block
+ cmds_subbuf[to.block].u32 = skc_rolling_block(impl->wip.rolling.next,tag);
+
+ // increment rolling
+ skc_rolling_inc(impl);
+#endif
+
+ // return pointer to block
+ float * const blocks_subbuf = impl->blocks.subbufs[to.subbuf].host;
+
+ // FIXME -- make it easier to get config constant
+ return blocks_subbuf + (to.block * impl->runtime->config->block.words);
+}
+
+//
+//
+//
+
+static
+void
+skc_path_builder_impl_flush_node(struct skc_path_builder_impl * const impl)
+{
+ // store command to subbuf and get pointer to blocks subbuf
+ void * const block = skc_path_builder_impl_new_command(impl,impl->wip.ids.rolling,
+ SKC_CMD_PATHS_COPY_TAG_NODE);
+
+ // copy head to blocks subbuf -- write-only
+ memcpy(block,impl->wip.node,impl->runtime->config->block.bytes);
+}
+
+static
+void
+skc_path_builder_impl_flush_head(struct skc_path_builder_impl * const impl)
+{
+ // store command to subbuf and get pointer to blocks subbuf
+ void * const block = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next,
+ SKC_CMD_PATHS_COPY_TAG_HEAD);
+
+ // copy head to blocks subbuf -- write-only
+ memcpy(block,impl->wip.head,impl->runtime->config->block.bytes);
+
+ // increment rolling
+ skc_rolling_inc(impl);
+
+ // the 'to' index is non-inclusive so assign wip.to after flush_head
+ impl->curr.to = impl->wip.to;
+}
+
+//
+//
+//
+
+static
+void
+skc_path_builder_impl_new_node_block(struct skc_path_builder_impl * const impl)
+{
+ // update final block id in node
+ impl->wip.ids.next->u32 = skc_rolling_block(impl->wip.rolling.next,SKC_BLOCK_ID_TAG_PATH_NEXT);
+
+ // if wip.ids is not the header then flush now full wip node
+ if (impl->wip.head->header.nodes > 0)
+ skc_path_builder_impl_flush_node(impl);
+
+ // bump node count
+ impl->wip.head->header.nodes += 1;
+
+ // save current rolling
+ impl->wip.ids.rolling = impl->wip.rolling.next;
+
+ // increment rolling
+ skc_rolling_inc(impl);
+
+ // update wip.ids.*
+ impl->wip.ids.next = impl->wip.node->tag_ids;
+ impl->wip.ids.rem = impl->runtime->config->block.words;
+}
+
+static
+void
+skc_path_builder_impl_new_segs_block(struct skc_path_builder_impl * const impl)
+{
+ impl->wip.subblocks.rem = impl->runtime->config->block.subblocks; // FIXME -- move constants closer to structure
+ impl->wip.subblocks.rolling = impl->wip.rolling.next;
+ impl->wip.subblocks.next = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next,
+ SKC_CMD_PATHS_COPY_TAG_SEGS);
+ impl->wip.subblocks.idx = 0;
+
+ // increment rolling
+ skc_rolling_inc(impl);
+}
+
+//
+//
+//
+
+static
+void
+skc_path_builder_impl_acquire_subblocks(struct skc_path_builder_impl * const impl,
+ skc_block_id_tag tag,
+ skc_uint vertices,
+ float * * subblocks)
+{
+ //
+ // FIRST TAG RECORDS THE ELEMENT TYPE
+ //
+ while (true)
+ {
+ // if only one block id slot is left in the node then acquire a new
+ // node block and append its block id with a 'next' tag
+ if (impl->wip.ids.rem == 1)
+ skc_path_builder_impl_new_node_block(impl);
+
+ // if zero subblocks left then acquire a new subblock block and
+ // append its block id
+ if (impl->wip.subblocks.rem == 0)
+ skc_path_builder_impl_new_segs_block(impl);
+
+ // save first command -- tag and subblocks may have been updated
+ impl->wip.ids.next->u32 = skc_rolling_subblock(impl->wip.subblocks.rolling,impl->wip.subblocks.idx,tag);
+
+ // increment node block subblock pointer
+ impl->wip.ids.next += 1;
+ impl->wip.ids.rem -= 1;
+
+ // how many vertices can we store
+ skc_uint rem = min(vertices,impl->wip.subblocks.rem);
+
+ // decrement vertices
+ vertices -= rem;
+ impl->wip.subblocks.rem -= rem;
+ impl->wip.subblocks.idx += rem;
+
+ // assign subblocks
+ do {
+ *subblocks++ = impl->wip.subblocks.next;
+ impl->wip.subblocks.next += impl->runtime->config->subblock.words;
+ // FIXME -- move constants closer to structure
+ } while (--rem > 0);
+
+ // anything left to do?
+ if (vertices == 0)
+ break;
+
+ // any tag after this will be a caboose command
+ tag = SKC_BLOCK_ID_TAG_PATH_NEXT;
+ }
+}
+
+//
+//
+//
+
+static
+void
+skc_path_builder_pfn_end(struct skc_path_builder_impl * const impl, skc_path_t * const path)
+{
+ // finalize incomplete active subblocks -- we don't care about any
+ // remaining unused subblocks in block
+ skc_path_builder_finalize_subblocks(impl->path_builder);
+
+ // mark remaining wip.ids in the head or node as invalid
+ skc_path_builder_impl_finalize_node(impl);
+
+ // flush the node if the wip ids are in a node and not still in the head
+ if (impl->wip.head->header.nodes >= 1)
+ skc_path_builder_impl_flush_node(impl);
+
+ // acquire path host id
+ *path = skc_runtime_handle_device_acquire(impl->runtime); // FIXME -- MAY WANT TO GRAB AN ID ON BEGIN
+
+ // save path host handle
+ impl->wip.head->header.handle = *path;
+
+ // flush head -- acquires a block and bumps head->header.blocks
+ skc_path_builder_impl_flush_head(impl);
+
+ // get current release
+ struct skc_release_record * const release = skc_release_curr(impl);
+
+ // acquire grid if null
+ if (release->grid == NULL)
+ {
+ release->grid =
+ SKC_GRID_DEPS_ATTACH(impl->runtime->deps,
+ &release->grid, // NULL on start/force
+ release, // data payload
+ skc_path_builder_grid_pfn_waiting,
+ NULL, // no execute pfn
+ skc_path_builder_grid_pfn_dispose);
+ }
+
+ // update grid map
+ skc_grid_map(release->grid,*path);
+
+ // update path release
+ impl->release.paths[release->to] = *path;
+
+ // increment release.to
+ release->to = (release->to + 1) % impl->ring.blocks_per.buffer;
+
+ // add guard bit
+ *path |= SKC_TYPED_HANDLE_TYPE_IS_PATH;
+
+#if 1
+ //
+ // eager kernel launch?
+ //
+ {
+ union skc_ringdex_expand const curr_from = skc_ringdex_expand(impl,impl->curr.from);
+ union skc_ringdex_expand const curr_to = skc_ringdex_expand(impl,impl->curr.to);
+
+ if (curr_from.subbuf != curr_to.subbuf)
+ {
+ skc_grid_start(release->grid);
+ // skc_scheduler_yield(impl->runtime->scheduler);
+ }
+ }
+#endif
+}
+
+//
+// FIXME -- clean up accessing of CONFIG constants in these 3 routines
+//
+
+static
+void
+skc_path_builder_pfn_new_line(struct skc_path_builder_impl * const impl)
+{
+ // acquire subblock pointers
+ skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_LINE,4,
+ impl->path_builder->line.coords);
+
+ // increment line count
+ impl->wip.head->header.prims += 1;
+
+ // update line.rem count
+ impl->path_builder->line.rem = impl->runtime->config->subblock.words;
+}
+
+static
+void
+skc_path_builder_pfn_new_quad(struct skc_path_builder_impl * const impl)
+{
+ // acquire subblock pointers
+ skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_QUAD,6,
+ impl->path_builder->quad.coords);
+
+ // increment quad count
+ impl->wip.head->header.prims += 1;
+
+ // update quad.rem count
+ impl->path_builder->quad.rem = impl->runtime->config->subblock.words;
+}
+
+static
+void
+skc_path_builder_pfn_new_cubic(struct skc_path_builder_impl * const impl)
+{
+ // acquire subblock pointers
+ skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_CUBIC,8,
+ impl->path_builder->cubic.coords);
+
+ // increment cubic count
+ impl->wip.head->header.prims += 1;
+
+ // update cubic.rem count
+ impl->path_builder->cubic.rem = impl->runtime->config->subblock.words;
+}
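+
+//
+// Flow sketch (host side, as implied by the coords/rem fields): the
+// generic path builder front end writes coordinates through the
+// line/quad/cubic coords[] pointers and decrements the matching .rem
+// counter; when a counter hits zero the next primitive of that type
+// invokes the corresponding pfn_new_* hook above to acquire fresh
+// subblock pointers.
+//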
+
+//
+//
+//
+
+static
+void
+skc_path_builder_pfn_release(struct skc_path_builder_impl * const impl)
+{
+ // decrement reference count
+ if (--impl->path_builder->refcount != 0)
+ return;
+
+ //
+ // otherwise, dispose of everything
+ //
+ struct skc_runtime * const runtime = impl->runtime;
+
+ // free path builder
+ skc_runtime_host_perm_free(impl->runtime,impl->path_builder);
+
+ // release cq
+ skc_runtime_release_cq_in_order(runtime,impl->cq);
+
+ // release kernels
+ cl(ReleaseKernel(impl->kernels.alloc));
+ cl(ReleaseKernel(impl->kernels.copy));
+
+ // free blocks extents
+ cl(ReleaseMemObject(impl->blocks.buffer));
+ skc_runtime_host_perm_free(runtime,impl->blocks.subbufs);
+
+ cl(ReleaseMemObject(impl->cmds.buffer));
+ skc_runtime_host_perm_free(runtime,impl->cmds.subbufs);
+
+ // free records
+ skc_runtime_host_perm_free(runtime,impl->release.records);
+ skc_runtime_host_perm_free(runtime,impl->release.paths);
+
+ // release staging head and node
+ skc_runtime_host_perm_free(runtime,impl->wip.head);
+ skc_runtime_host_perm_free(runtime,impl->wip.node);
+
+ // release reads scratch array
+ cl(ReleaseMemObject(impl->reads));
+
+ // for all subbuffers
+ // unmap subbuffer
+ // release subbuffer
+ // printf("%s not releasing subbuffers\n",__func__);
+
+ skc_runtime_host_perm_free(impl->runtime,impl);
+}
+
+//
+//
+//
+
+skc_err
+skc_path_builder_cl_12_create(struct skc_context * const context,
+ struct skc_path_builder * * const path_builder)
+{
+ //
+ // retain the context
+ // skc_context_retain(context);
+ //
+ struct skc_runtime * const runtime = context->runtime;
+
+ // allocate path builder
+ (*path_builder) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**path_builder));
+
+ // init state
+ SKC_ASSERT_STATE_INIT((*path_builder),SKC_PATH_BUILDER_STATE_READY);
+
+ (*path_builder)->context = context;
+
+ // save opaque impl-specific pointers
+ (*path_builder)->begin = skc_path_builder_pfn_begin;
+ (*path_builder)->end = skc_path_builder_pfn_end;
+ (*path_builder)->new_line = skc_path_builder_pfn_new_line;
+ (*path_builder)->new_quad = skc_path_builder_pfn_new_quad;
+ (*path_builder)->new_cubic = skc_path_builder_pfn_new_cubic;
+ (*path_builder)->release = skc_path_builder_pfn_release;
+
+ // initialize path builder counts
+ (*path_builder)->line.rem = 0;
+ (*path_builder)->quad.rem = 0;
+ (*path_builder)->cubic.rem = 0;
+
+ (*path_builder)->refcount = 1;
+
+ struct skc_path_builder_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl));
+
+ (*path_builder)->impl = impl;
+
+ //
+ // init impl
+ //
+ impl->path_builder = *path_builder;
+ impl->runtime = runtime;
+
+ impl->cq = skc_runtime_acquire_cq_in_order(runtime);
+
+ impl->kernels.alloc = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_ALLOC);
+ impl->kernels.copy = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_COPY);
+
+ //
+ // FIXME -- let these config constants remain constant and in place
+ //
+ struct skc_config const * const config = runtime->config;
+
+ impl->ring.subbufs = config->paths_copy.buffer.count;
+ impl->ring.blocks_per.buffer = config->paths_copy.subbuf.count * config->paths_copy.buffer.count;
+ impl->ring.blocks_per.subbuf = config->paths_copy.subbuf.count;
+ //
+ // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ //
+
+ cl_int cl_err;
+
+ // allocate large device-side extent for path data
+ impl->blocks.buffer = clCreateBuffer(runtime->cl.context,
+ CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+ config->paths_copy.block.buffer, // FIXME -- either use config or local constants everywhere
+ NULL,&cl_err); cl_ok(cl_err);
+
+ // allocate small host-side array of pointers to mapped subbufs
+ impl->blocks.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
+ impl->ring.subbufs *
+ sizeof(*impl->blocks.subbufs));
+
+ // allocate large device-side extent for path copy commands
+ impl->cmds.buffer = clCreateBuffer(runtime->cl.context,
+ CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+ config->paths_copy.command.buffer,
+ NULL,&cl_err); cl_ok(cl_err);
+
+ // allocate small host-side array of pointers to mapped subbufs
+ impl->cmds.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
+ impl->ring.subbufs *
+ sizeof(*impl->cmds.subbufs));
+
+ // allocate small host-side array of intervals of path handles
+ impl->release.records = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
+ impl->ring.subbufs *
+ sizeof(*impl->release.records));
+
+ // allocate large host-side array that is max # of path handles in flight
+ impl->release.paths = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
+ impl->ring.blocks_per.buffer *
+ sizeof(*impl->release.paths));
+
+ // small scratch used by kernels
+ impl->reads = clCreateBuffer(runtime->cl.context,
+ CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS,
+ sizeof(skc_uint) * impl->ring.subbufs,
+ NULL,&cl_err); cl_ok(cl_err);
+
+ // initialize release record with impl backpointer
+ for (skc_uint ii=0; ii<impl->ring.subbufs; ii++)
+ {
+ struct skc_release_record * record = impl->release.records + ii;
+
+ record->impl = impl;
+ record->grid = NULL;
+ record->from = record->to = ii * impl->ring.blocks_per.subbuf;
+ }
+
+ //
+ // allocate and map subbuffers -- we always check the command
+ // subbuffer's map/unmap events before touching it or its associated
+ // block subbuffer.
+ //
+ struct skc_subbuffer_blocks * sb = impl->blocks.subbufs;
+ struct skc_subbuffer_cmds * sc = impl->cmds .subbufs;
+
+ cl_buffer_region rb = { 0, config->paths_copy.block.subbuf };
+ cl_buffer_region rc = { 0, config->paths_copy.command.subbuf };
+
+ // for each subbuffer
+ for (skc_uint ii=0; ii<config->paths_copy.buffer.count; ii++)
+ {
+ sb->device = clCreateSubBuffer(impl->blocks.buffer,
+ CL_MEM_HOST_WRITE_ONLY,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ &rb,
+ &cl_err); cl_ok(cl_err);
+
+ sb->host = clEnqueueMapBuffer(impl->cq,
+ sb->device,
+ CL_FALSE,
+ CL_MAP_WRITE_INVALIDATE_REGION,
+ 0,rb.size,
+ 0,NULL,NULL,
+ &cl_err); cl_ok(cl_err);
+
+ sc->device = clCreateSubBuffer(impl->cmds.buffer,
+ CL_MEM_HOST_WRITE_ONLY,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ &rc,
+ &cl_err); cl_ok(cl_err);
+
+ sc->host = clEnqueueMapBuffer(impl->cq,
+ sc->device,
+ CL_FALSE,
+ CL_MAP_WRITE_INVALIDATE_REGION,
+ 0,rc.size,
+ 0,NULL,&sc->map,
+ &cl_err); cl_ok(cl_err);
+ sb += 1;
+ sc += 1;
+
+ rb.origin += rb.size;
+ rc.origin += rc.size;
+ }
+
+ //
+ // initialize remaining members
+ //
+ impl->prev.from = 0;
+ impl->prev.to = 0;
+ impl->prev.rolling = 0;
+
+ impl->curr.from = 0;
+ impl->curr.to = 0;
+
+ impl->wip.to = 0;
+
+ impl->wip.head = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes);
+ impl->wip.node = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes);
+
+ impl->wip.rolling.one = SKC_BLOCK_ID_TAG_COUNT * config->block.subblocks;
+ impl->wip.rolling.next = 0;
+
+ // for now, completely initialize builder before returning
+ cl(Finish(impl->cq));
+
+ return SKC_ERR_SUCCESS;
+}
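+
+//
+// Lifecycle sketch (summarizing the routines above): create() maps
+// every subbuffer up front; begin()/new_*()/end() stream path data
+// into the mapped subbuffers; grid_pfn_waiting() unmaps the filled
+// subbuffers and launches the alloc + copy kernels; the copy
+// completion callback schedules grid completion; and
+// grid_pfn_dispose() releases the path handles so a subbuffer can be
+// remapped and reused.
+//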
+
+//
+//
+//