Diffstat (limited to 'src/compute/skc/platforms/cl_12/path_builder_cl_12.c')
-rw-r--r-- | src/compute/skc/platforms/cl_12/path_builder_cl_12.c | 1443 |
1 file changed, 1443 insertions, 0 deletions
diff --git a/src/compute/skc/platforms/cl_12/path_builder_cl_12.c b/src/compute/skc/platforms/cl_12/path_builder_cl_12.c new file mode 100644 index 0000000000..e915dffada --- /dev/null +++ b/src/compute/skc/platforms/cl_12/path_builder_cl_12.c @@ -0,0 +1,1443 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <float.h> +#include <stdio.h> + +#include "common/cl/assert_cl.h" + +#include "context.h" +#include "handle.h" +#include "grid.h" +#include "path.h" +#include "path_builder.h" + +#include "config_cl.h" +#include "export_cl_12.h" +#include "runtime_cl_12.h" +#include "path_builder_cl_12.h" + +// +// OpenCL 1.2 devices support mapping of buffers into the host address +// space. +// +// Mapped buffers must be aligned on MIN_DATA_TYPE_ALIGN_SIZE bit +// boundary (e.g. 128 bytes). This complicates coordinating sharing +// of data between the host and the device. +// +// Some OpenCL 2.0 devices support fine-grained shared virtual memory +// pointers with byte-addressing and allow simpler coordination +// strategies at the cost of maintaining cache coherency. +// +// The path builder is focused on moving bulk path data from the host +// into the device-managed "block" memory pool and arranging it into a +// SIMT/SIMD-friendly data structure that can be efficiently read by +// the rasterizer. +// +// Note that one simplifying assumption is that the maximum length of +// a *single* path can't be larger than what fits in the single extent +// (which is split into M subbuffers). This would be a very long path +// and a legitimate size limitation. +// +// For some systems, it may be appropriate to never pull path data +// into the device-managed block pool and instead present the path +// data to the device in a temporarily available allocated memory +// "zone" of paths that can be discarded all at once. +// +// For other systems, it may be appropriate to simply copy the path +// data from host to device. +// +// But the majority of OpenCL (and VK, MTL, DX12) devices we'll be +// targeting support basic map/unmap functionality similar to OpenCL +// 1.2. Furthermore, not all OpenCL 2.0 devices support fine-grained +// sharing of memory and still require a map/unmap step... but note +// that they all support byte-aligned mapping and subbuffers. +// +// The general strategy that this particular CL_12 implementation uses +// is to allocate a large mappable bulk-data path buffer and an +// auxilary mappable command buffer. +// +// The buffers are split into a reasonable number of properly aligned +// subbuffers to enable simultaneous host and device access. +// + +// +// Blocks: +// 1 extent +// M mapped subbuffers (configurable) to allow for concurrency +// +// Commands: +// 1 extent +// M mapped subbuffers (configurable) to allow for concurrency +// +// Spans: +// M hi/lo structures +// +// { cl_sub, void*, event, base } +// +// - size of sub buffer +// - remaining +// +// - counts +// + +// +// For any kernel launch, at most one path will be discontiguous and +// defined across two sub-buffers. +// +// Nodes are updated locally until full and then stored so they will +// never be incomplete. Headers are stored locally until the path is +// ended so they will never be incomplete. +// +// A line, quad or cubic acquires 4/6/8 segments which may be spread +// across one or more congtiguous blocks. 
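The subbuffer ring described above can be illustrated with a small standalone sketch. The constants here are assumed for illustration only; the real values come from the runtime config (see skc_path_builder_cl_12_create() later in this file).

#include <stdio.h>

#define SUBBUFS            4u   /* M mapped subbuffers (assumed)     */
#define BLOCKS_PER_SUBBUF 64u   /* blocks per subbuffer (assumed)    */
#define BLOCKS_PER_BUFFER (SUBBUFS * BLOCKS_PER_SUBBUF)

int main(void)
{
  /* a "ringdex" walks the whole block ring and wraps at the end */
  unsigned const ringdex = 130u % BLOCKS_PER_BUFFER;

  /* decompose into (subbuffer, block within subbuffer) */
  unsigned const subbuf = ringdex / BLOCKS_PER_SUBBUF;
  unsigned const block  = ringdex % BLOCKS_PER_SUBBUF;

  printf("ringdex %u -> subbuf %u, block %u\n",ringdex,subbuf,block);

  return 0;
}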
+// +// If a flush() occurs then the remaining columns of multi-segment +// paths are initialized with zero-length line, quad, cubic elements. +// +// Every block's command word has a type and a count acquired from a +// rolling counter. +// +// The kernel is passed two spans of blocks { base, count } to +// process. The grid is must process (lo.count + hi.count) blocks. +// + +struct skc_subbuffer_blocks +{ + cl_mem device; + void * host; +}; + +struct skc_subbuffer_cmds +{ + cl_mem device; + void * host; + cl_event map; +}; + +// +// ringdex is an index with range [0, blocks-per-subbuf * subbufs-per-buffer ) +// + +typedef skc_uint skc_ringdex_t; + +union skc_ringdex_expand +{ + div_t qr; + + struct { +#ifndef SKC_DIV_REM_BEFORE_QUOT // offsetof(div_t,quot) != 0 + skc_uint subbuf; + skc_uint block; +#else + skc_uint block; + skc_uint subbuf; +#endif + }; +}; + +// +// this record is executed by the grid +// + +struct skc_release_record +{ + struct skc_path_builder_impl * impl; // back pointer to impl + + skc_grid_t grid; // pointer to scheduled grid + + skc_uint from; // inclusive starting index : [from,to) + skc_uint to; // non-inclusive ending index : [from,to) +}; + +// +// +// + +struct skc_path_builder_impl +{ + struct skc_path_builder * path_builder; + + struct skc_runtime * runtime; + + cl_command_queue cq; + + struct { + cl_kernel alloc; + cl_kernel copy; + } kernels; + + // + // FIXME -- make this pointer to constant config + // + // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv + struct { + skc_uint subbufs; // how many subbufs in the buffer? + + struct { + skc_uint buffer; // how many blocks in the buffer? + skc_uint subbuf; // how many blocks in a subbuf? + } blocks_per; + } ring; + // + // ^^^^^^^^^^^ don't duplicate these constants ^^^^^^^^^^^^^^^^^^ + // + + struct { + cl_mem buffer; // backing buffer for blocks + struct skc_subbuffer_blocks * subbufs; // array of structures + } blocks; + + struct { + cl_mem buffer; // backing buffer for commands + struct skc_subbuffer_cmds * subbufs; // array of structures + } cmds; + + struct { + struct skc_release_record * records; // max release records is equal to max subbufs + skc_path_t * paths; // max paths is less than or equal to max commands + } release; + + cl_mem reads; // each kernel only requires one word to store the block pool "base" + + struct { + skc_uint rolling; // rolling counter used by cmds to map to block pool alloc + skc_ringdex_t from; + skc_ringdex_t to; + } prev; + + struct { + skc_ringdex_t from; + skc_ringdex_t to; + } curr; + + struct { + struct skc_path_head * head; // pointer to local path header -- not written until path end + struct skc_path_node * node; // pointer to local node -- may alias head until head is full + + struct { + skc_uint rolling; // rolling counter of wip node -- valid after one node is allocated + union skc_tagged_block_id * next; // next slot in node -- may initially point to head.ids + skc_uint rem; // how many id slots left in node block + } ids; + + struct { + skc_uint rem; // how many subblocks left in block? 
+ skc_uint rolling; // rolling counter of block of subblocks + float * next; // next subblock in current subblock block + skc_uint idx; // index of next subblock + } subblocks; + + struct { + skc_uint one; // .block = 1 + skc_uint next; // rolling counter used by cmds to map to block pool alloc + } rolling; + + skc_ringdex_t to; // ringdex of _next_available_ command/block in ring -- FIXME -- should be current + } wip; +}; + +// +// FIXME -- move to a pow2 subbuffer size and dispense with division +// and modulo operations +// + +static +union skc_ringdex_expand +skc_ringdex_expand(struct skc_path_builder_impl * const impl, + skc_ringdex_t const ringdex) +{ + return (union skc_ringdex_expand){ + .qr = div(ringdex,impl->ring.blocks_per.subbuf) + }; +} + +static +void +skc_ringdex_wip_to_block_inc(struct skc_path_builder_impl * const impl) +{ + // + // FIXME - which is faster? + // +#if 1 + impl->wip.to = (impl->wip.to + 1) % impl->ring.blocks_per.buffer; +#else + impl->wip.to -= (impl->wip.to < impl->ring.blocks_per.buffer) ? -1 : impl->wip.to; +#endif + + // this path is too long -- for now assert() and die + assert(impl->wip.to != impl->curr.from); +} + +static +skc_ringdex_t +skc_ringdex_span(struct skc_path_builder_impl * const impl, + skc_ringdex_t const from, + skc_ringdex_t const to) +{ + return (to - from) % impl->ring.blocks_per.buffer; +} + +static +void +skc_ringdex_wip_to_subbuf_inc(struct skc_path_builder_impl * const impl) +{ + union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to); + + // nothing to do if this is the first block in the subbuf + if (to.block == 0) + return; + + skc_uint const new_subbuf = (to.subbuf + 1) % impl->ring.subbufs; + + // otherwise increment and mod + impl->wip.to = new_subbuf * impl->ring.blocks_per.subbuf; +} + +static +skc_bool +skc_ringdex_curr_is_equal(struct skc_path_builder_impl * const impl) +{ + return impl->curr.from == impl->curr.to; +} + +static +skc_bool +skc_ringdex_prev_is_equal(struct skc_path_builder_impl * const impl) +{ + return impl->prev.from == impl->prev.to; +} + +static +skc_uint +skc_ringdex_dont_map_last(struct skc_path_builder_impl * const impl, + skc_uint const to_block) +{ + // no blocks acquired OR this is last block in subbuf + return !((impl->wip.to == impl->curr.to) || (to_block == 0)); +} + +// +// +// + +static +struct skc_release_record * +skc_release_curr(struct skc_path_builder_impl * const impl) +{ + union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from); + + return impl->release.records + curr_from.subbuf; +} + +// +// FIXME -- get rid of all distant config references -- grab them at all at creation time +// + +static +void +skc_path_builder_pfn_begin(struct skc_path_builder_impl * const impl) +{ + // init header counters // { handle, blocks, nodes, prims } + impl->wip.head->header = (union skc_path_header){ + .handle = 0, + .blocks = 0, + .nodes = 0, + .prims = 0 + }; + + // FIXME -- BOUNDS SHOULD USE SIMD4 TRICK AND NEGATE ONE OF THE CORNERS + impl->wip.head->bounds = (union skc_path_bounds){ +FLT_MIN, +FLT_MIN, -FLT_MIN, -FLT_MIN }; + + // point wip ids at local head node + impl->wip.ids.next = impl->wip.head->tag_ids; // point to local head node + impl->wip.ids.rem = impl->runtime->config->block.words - SKC_PATH_HEAD_WORDS; // FIXME -- save this constant somewhere + + // start with no subblocks + impl->wip.subblocks.rem = 0; +} + +// +// +// + +static +void +skc_path_builder_impl_finalize_node(struct skc_path_builder_impl * const impl) +{ +#if 1 + // + // FIXME -- 
a Duff's device might be optimal here but would have to + // be customized per device since node's could be 16-128+ words + // + while (impl->wip.ids.rem > 0) + { + impl->wip.ids.rem -= 1; + impl->wip.ids.next->u32 = SKC_TAGGED_BLOCK_ID_INVALID; + impl->wip.ids.next += 1; + } +#else + memset(&impl->wip.ids.next->u32, + SKC_TAGGED_BLOCK_ID_INVALID, // 0xFF + sizeof(impl->wip.ids.next->u32) * impl->wip.ids.rem); + + impl->wip.ids.next += impl->wip.ids.rem; + impl->wip.ids.rem = 0; +#endif +} + +// +// +// + +static +void +skc_zero_float(skc_float * p, skc_uint rem) +{ + memset(p,0,sizeof(*p)*rem); +} + +static +void +skc_path_builder_finalize_subblocks(struct skc_path_builder * const path_builder) +{ + // + // FIXME -- it might be more performant to zero the remaining + // columns in a subblock -- a subblock at a time -- instead of the + // same column across all the subblocks + // +#if 0 + while (path_builder->line.rem > 0) + { + --path_builder->line.rem; + + *path_builder->line.coords[0]++ = 0.0f; + *path_builder->line.coords[1]++ = 0.0f; + *path_builder->line.coords[2]++ = 0.0f; + *path_builder->line.coords[3]++ = 0.0f; + } + + while (path_builder->quad.rem > 0) + { + --path_builder->quad.rem; + + *path_builder->line.coords[0]++ = 0.0f; + *path_builder->line.coords[1]++ = 0.0f; + *path_builder->line.coords[2]++ = 0.0f; + *path_builder->line.coords[3]++ = 0.0f; + *path_builder->line.coords[4]++ = 0.0f; + *path_builder->line.coords[5]++ = 0.0f; + } + + while (path_builder->cubic.rem > 0) + { + --path_builder->cubic.rem; + + *path_builder->line.coords[0]++ = 0.0f; + *path_builder->line.coords[1]++ = 0.0f; + *path_builder->line.coords[2]++ = 0.0f; + *path_builder->line.coords[3]++ = 0.0f; + *path_builder->line.coords[4]++ = 0.0f; + *path_builder->line.coords[5]++ = 0.0f; + *path_builder->line.coords[6]++ = 0.0f; + *path_builder->line.coords[7]++ = 0.0f; + } +#else + if (path_builder->line.rem > 0) + { + skc_zero_float(path_builder->line.coords[0],path_builder->line.rem); + skc_zero_float(path_builder->line.coords[1],path_builder->line.rem); + skc_zero_float(path_builder->line.coords[2],path_builder->line.rem); + skc_zero_float(path_builder->line.coords[3],path_builder->line.rem); + + path_builder->line.rem = 0; + } + + if (path_builder->quad.rem > 0) + { + skc_zero_float(path_builder->quad.coords[0],path_builder->quad.rem); + skc_zero_float(path_builder->quad.coords[1],path_builder->quad.rem); + skc_zero_float(path_builder->quad.coords[2],path_builder->quad.rem); + skc_zero_float(path_builder->quad.coords[3],path_builder->quad.rem); + skc_zero_float(path_builder->quad.coords[4],path_builder->quad.rem); + skc_zero_float(path_builder->quad.coords[5],path_builder->quad.rem); + + path_builder->quad.rem = 0; + } + + if (path_builder->cubic.rem > 0) + { + skc_zero_float(path_builder->cubic.coords[0],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[1],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[2],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[3],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[4],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[5],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[6],path_builder->cubic.rem); + skc_zero_float(path_builder->cubic.coords[7],path_builder->cubic.rem); + + path_builder->cubic.rem = 0; + } +#endif +} + +// +// +// + +static +void +skc_path_builder_impl_unmap(struct skc_path_builder_impl * const impl, + skc_uint from, + 
skc_uint to) +{ + // to might be out of range + to = to % impl->ring.subbufs; + +#if 0 + fprintf(stderr,"unmap: [%2u,%2u)\n",from,to); +#endif + + while (from != to) // 'to' might be out of range + { + // bring 'from' back in range + from = from % impl->ring.subbufs; + + struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from; + struct skc_subbuffer_cmds * const cmds = impl->cmds .subbufs + from; + + cl(EnqueueUnmapMemObject(impl->cq, + blocks->device, + blocks->host, + 0,NULL,NULL)); + + cl(EnqueueUnmapMemObject(impl->cq, + cmds->device, + cmds->host, + 0,NULL,NULL)); + + // bring from back in range + from = ++from % impl->ring.subbufs; + } +} + +// +// FIXME -- reuse this in create() +// + +static +void +skc_path_builder_impl_map(struct skc_path_builder_impl * const impl, + skc_uint from, + skc_uint to) +{ + // to might be out of range + to = to % impl->ring.subbufs; + +#if 0 + fprintf(stderr," map: [%2u,%2u)\n",from,to); +#endif + + while (from != to) + { + cl_int cl_err; + + struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from; + struct skc_subbuffer_cmds * const cmds = impl->cmds .subbufs + from; + + blocks->host = clEnqueueMapBuffer(impl->cq, + blocks->device, + CL_FALSE, + CL_MAP_WRITE_INVALIDATE_REGION, + 0,impl->runtime->config->paths_copy.block.subbuf, + 0,NULL,NULL, + &cl_err); cl_ok(cl_err); + + cl(ReleaseEvent(cmds->map)); + + cmds->host = clEnqueueMapBuffer(impl->cq, + cmds->device, + CL_FALSE, + CL_MAP_WRITE_INVALIDATE_REGION, + 0,impl->runtime->config->paths_copy.command.subbuf, + 0,NULL,&cmds->map, + &cl_err); cl_ok(cl_err); + + // bring from back in range + from = ++from % impl->ring.subbufs; + } + // + // FIXME -- when we switch to out of order queues we'll need a barrier here + // +} + +// +// +// + +static +void +skc_path_builder_release_dispose(struct skc_release_record * const release, + struct skc_path_builder_impl * const impl) +{ + struct skc_runtime * runtime = impl->runtime; + + if (release->from <= release->to) // no wrap + { + skc_path_t const * paths = impl->release.paths + release->from; + skc_uint count = release->to - release->from; + + skc_grid_deps_unmap(runtime->deps,paths,count); + skc_runtime_path_device_release(runtime,paths,count); + } + else // from > to implies wrap + { + skc_path_t const * paths_lo = impl->release.paths + release->from; + skc_uint count_lo = impl->ring.blocks_per.buffer - release->from; + + skc_grid_deps_unmap(runtime->deps,paths_lo,count_lo); + skc_runtime_path_device_release(runtime,paths_lo,count_lo); + + skc_grid_deps_unmap(runtime->deps,impl->release.paths,release->to); + skc_runtime_path_device_release(runtime,impl->release.paths,release->to); + } + + release->to = release->from; +} + +static +void +skc_path_builder_grid_pfn_dispose(skc_grid_t const grid) +{ + struct skc_release_record * const release = skc_grid_get_data(grid); + struct skc_path_builder_impl * const impl = release->impl; + + skc_path_builder_release_dispose(release,impl); +} + +static +void +// skc_path_builder_complete(struct skc_release_record * const release) +skc_path_builder_complete(skc_grid_t grid) +{ + // + // notify deps that this grid is complete enough for other grids to + // proceed + // + // the path builder still has some cleanup to do before all its + // resources can be reused + // + skc_grid_complete(grid); +} + +static +void +skc_path_builder_paths_copy_cb(cl_event event, cl_int status, skc_grid_t grid) +{ + SKC_CL_CB(status); + + struct skc_release_record * const release = skc_grid_get_data(grid); + + 
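  // defer the real completion work to the runtime scheduler instead of
  // doing it here -- OpenCL event callbacks generally run on a
  // driver-owned thread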
SKC_SCHEDULER_SCHEDULE(release->impl->runtime->scheduler,skc_path_builder_complete,grid); +} + +// +// +// + +static +void +skc_path_builder_grid_pfn_waiting(skc_grid_t const grid) +{ + struct skc_release_record * const release = skc_grid_get_data(grid); + struct skc_path_builder_impl * const impl = release->impl; + + // 1. flush incomplete subblocks of path elements + // 2. unmap subbuffer on cq.unmap + // 3. flush cq.unmap + // 4. launch kernel on cq.kernel but wait for unmap completion + // 5. flush cq.kernel + // 6. remap relevant subbuffers on cq.map but wait for kernel completion + // 7. flush cq.map + + // + // FIXME -- can be smarter about flushing if the wip paths are not + // in the same subbuf as curr.to + // + // THIS IS IMPORTANT TO FIX + // + + // flush incomplete subblocks + skc_path_builder_finalize_subblocks(impl->path_builder); + + // + // get range of subbufs that need to be unmapped + // + // note that impl->prev subbufs have already been unmapped + // + union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from); + union skc_ringdex_expand curr_to = skc_ringdex_expand(impl,impl->curr.to); + skc_uint const is_partial = curr_to.block > 0; + skc_uint const unmap_to = curr_to.subbuf + is_partial; + + // + // unmap all subbufs in range [from,to) + // + skc_path_builder_impl_unmap(impl,curr_from.subbuf,unmap_to); + + // + // launch kernels + // + skc_uint const pb_prev_span = skc_ringdex_span(impl,impl->prev.from,impl->prev.to); + skc_uint const pb_curr_span = skc_ringdex_span(impl,impl->curr.from,impl->curr.to); + skc_uint const pb_cmds = pb_prev_span + pb_curr_span; + + // + // 1) allocate blocks from pool + // + + // + // FIXME -- pack integers into struct/vector + // + cl(SetKernelArg(impl->kernels.alloc,0,SKC_CL_ARG(impl->runtime->block_pool.atomics.drw))); + cl(SetKernelArg(impl->kernels.alloc,1,SKC_CL_ARG(impl->reads))); + cl(SetKernelArg(impl->kernels.alloc,2,SKC_CL_ARG(curr_from.subbuf))); + cl(SetKernelArg(impl->kernels.alloc,3,SKC_CL_ARG(pb_cmds))); + + skc_device_enqueue_kernel(impl->runtime->device, + SKC_DEVICE_KERNEL_ID_PATHS_ALLOC, + impl->cq, + impl->kernels.alloc, + 1, + 0,NULL,NULL); + + // + // 2) copy blocks from unmapped device-accessible memory + // + + // + // FIXME -- pack integers into struct/vector and reduce 13 arguments down to 7 + // + cl(SetKernelArg(impl->kernels.copy, 0,SKC_CL_ARG(impl->runtime->handle_pool.map.drw))); + + cl(SetKernelArg(impl->kernels.copy, 1,SKC_CL_ARG(impl->runtime->block_pool.ids.drw))); + cl(SetKernelArg(impl->kernels.copy, 2,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw))); + cl(SetKernelArg(impl->kernels.copy, 3,SKC_CL_ARG(impl->runtime->block_pool.size->ring_mask))); + + cl(SetKernelArg(impl->kernels.copy, 4,SKC_CL_ARG(impl->reads))); + cl(SetKernelArg(impl->kernels.copy, 5,SKC_CL_ARG(curr_from.subbuf))); + + cl(SetKernelArg(impl->kernels.copy, 6,SKC_CL_ARG(impl->cmds.buffer))); + cl(SetKernelArg(impl->kernels.copy, 7,SKC_CL_ARG(impl->blocks.buffer))); + + cl(SetKernelArg(impl->kernels.copy, 8,SKC_CL_ARG(impl->ring.blocks_per.buffer))); + cl(SetKernelArg(impl->kernels.copy, 9,SKC_CL_ARG(impl->prev.rolling))); + + cl(SetKernelArg(impl->kernels.copy,10,SKC_CL_ARG(impl->prev.from))); + cl(SetKernelArg(impl->kernels.copy,11,SKC_CL_ARG(pb_prev_span))); + cl(SetKernelArg(impl->kernels.copy,12,SKC_CL_ARG(impl->curr.from))); + + cl_event complete; + + skc_device_enqueue_kernel(impl->runtime->device, + SKC_DEVICE_KERNEL_ID_PATHS_COPY, + impl->cq, + impl->kernels.copy, + pb_cmds, + 0,NULL,&complete); + + // 
set a callback on completion + cl(SetEventCallback(complete,CL_COMPLETE, + skc_path_builder_paths_copy_cb, + grid)); + + // immediately release + cl(ReleaseEvent(complete)); + + // + // remap as many subbuffers as possible after the kernel completes + // + // note that remaps are async and enqueued on the same command queue + // as the kernel launch + // + // we can't remap subbuffers that are in the possibly empty range + // + // cases: + // + // - curr.to == wip.to which means no blocks have been acquired + // - curr.to points to first block in (next) subbuf + // - otherwise, wip acquired blocks in the curr.to subbuf + // + // check for these first 2 cases! + // + union skc_ringdex_expand const prev_from = skc_ringdex_expand(impl,impl->prev.from); + skc_uint const no_wip = impl->curr.to == impl->wip.to; + skc_uint map_to = curr_to.subbuf + (is_partial && no_wip); + + // remap all subbufs in range [from,to) + skc_path_builder_impl_map(impl,prev_from.subbuf,map_to); + + // flush command queue + cl(Flush(impl->cq)); + + // save rolling + impl->prev.rolling = impl->wip.rolling.next; + + // update prev and curr + if (no_wip) + { + // + // if there was no wip then round up to the next subbuf + // + skc_ringdex_wip_to_subbuf_inc(impl); + + // + // update prev/curr with with incremented wip + // + impl->prev.from = impl->prev.to = impl->wip.to; + impl->curr.from = impl->curr.to = impl->wip.to; + } + else + { + // + // update prev with wip partials + // + impl->prev.from = impl->curr.to; + impl->prev.to = impl->wip .to; + + // + // start curr on a new subbuf boundary + // + skc_ringdex_wip_to_subbuf_inc(impl); + + impl->curr.from = impl->wip.to; + impl->curr.to = impl->wip.to; + } +} + +// +// +// + +static +void +skc_path_builder_impl_acquire_subbuffer(struct skc_path_builder_impl * const impl, + skc_uint const subbuf) +{ + // + // FIXME -- move to a power-of-two subbuf size and kickstart path + // copies as early as possible + // + // FIXME -- the subbufs "self-clock" (flow control) the kernel + // launches and accounting. Combine all the subbuffers and release + // records into a single indexable struct instead of 3. + // + struct skc_subbuffer_cmds * const sc = impl->cmds.subbufs + subbuf; + struct skc_release_record * const release = impl->release.records + subbuf; + struct skc_scheduler * const scheduler = impl->runtime->scheduler; + + // can't proceed until the paths have been released + SKC_SCHEDULER_WAIT_WHILE(scheduler,release->from != release->to); + + // throw in a scheduler yield ... FIXME -- get rid of + skc_scheduler_yield(scheduler); + + // can't proceed until the subbuffer is mapped + cl(WaitForEvents(1,&sc->map)); +} + +// +// +// + +static +union skc_ringdex_expand +skc_path_builder_impl_acquire_block(struct skc_path_builder_impl * const impl) +{ + // break ringdex into components + union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to); + + // does wip ringdex point to a new subbuffer? 
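  // (to.block == 0 means wip.to just crossed a subbuffer boundary; that
  //  subbuffer may still be waiting on path releases and on its map
  //  event, so skc_path_builder_impl_acquire_subbuffer() below may block)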
+ if (to.block == 0) + { + // potentially spin/block waiting for subbuffer + skc_path_builder_impl_acquire_subbuffer(impl,to.subbuf); + } + + // post increment wip.to + skc_ringdex_wip_to_block_inc(impl); + + return to; +} + +// +// +// + +static +skc_uint +skc_rolling_block(skc_uint const rolling, skc_uint const tag) +{ + return rolling | tag; +} + +static +skc_uint +skc_rolling_subblock(skc_uint const rolling, skc_uint const subblock, skc_uint const tag) +{ + return rolling | (subblock << SKC_TAGGED_BLOCK_ID_BITS_TAG) | tag; +} + +static +void +skc_rolling_inc(struct skc_path_builder_impl * const impl) +{ + impl->wip.rolling.next += impl->wip.rolling.one; +} + +// +// +// + +static +void * +skc_path_builder_impl_new_command(struct skc_path_builder_impl * const impl, + skc_uint const rolling, + skc_cmd_paths_copy_tag const tag) +{ + // bump blocks count + impl->wip.head->header.blocks += 1; + + // acquire a block + union skc_ringdex_expand const to = skc_path_builder_impl_acquire_block(impl); + + // make a pointer + union skc_tagged_block_id * const cmds_subbuf = impl->cmds.subbufs[to.subbuf].host; + + // store command for block + cmds_subbuf[to.block].u32 = skc_rolling_block(rolling,tag); + +#if 0 + // store command for block + cmds_subbuf[to.block].u32 = skc_rolling_block(impl->wip.rolling.next,tag); + + // increment rolling + skc_rolling_inc(impl); +#endif + + // return pointer to block + float * const blocks_subbuf = impl->blocks.subbufs[to.subbuf].host; + + // FIXME -- make it easier to get config constant + return blocks_subbuf + (to.block * impl->runtime->config->block.words); +} + +// +// +// + +static +void +skc_path_builder_impl_flush_node(struct skc_path_builder_impl * const impl) +{ + // store command to subbuf and get pointer to blocks subbuf + void * const block = skc_path_builder_impl_new_command(impl,impl->wip.ids.rolling, + SKC_CMD_PATHS_COPY_TAG_NODE); + + // copy head to blocks subbuf -- write-only + memcpy(block,impl->wip.node,impl->runtime->config->block.bytes); +} + +static +void +skc_path_builder_impl_flush_head(struct skc_path_builder_impl * const impl) +{ + // store command to subbuf and get pointer to blocks subbuf + void * const block = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next, + SKC_CMD_PATHS_COPY_TAG_HEAD); + + // copy head to blocks subbuf -- write-only + memcpy(block,impl->wip.head,impl->runtime->config->block.bytes); + + // increment rolling + skc_rolling_inc(impl); + + // the 'to' index is non-inclusive so assign wip.to after flush_head + impl->curr.to = impl->wip.to; +} + +// +// +// + +static +void +skc_path_builder_impl_new_node_block(struct skc_path_builder_impl * const impl) +{ + // update final block id in node + impl->wip.ids.next->u32 = skc_rolling_block(impl->wip.rolling.next,SKC_BLOCK_ID_TAG_PATH_NEXT); + + // if wip.ids is not the header then flush now full wip node + if (impl->wip.head->header.nodes > 0) + skc_path_builder_impl_flush_node(impl); + + // bump node count + impl->wip.head->header.nodes += 1; + + // save current rolling + impl->wip.ids.rolling = impl->wip.rolling.next; + + // increment rolling + skc_rolling_inc(impl); + + // update wip.ids.* + impl->wip.ids.next = impl->wip.node->tag_ids; + impl->wip.ids.rem = impl->runtime->config->block.words; +} + +static +void +skc_path_builder_impl_new_segs_block(struct skc_path_builder_impl * const impl) +{ + impl->wip.subblocks.rem = impl->runtime->config->block.subblocks; // FIXME -- move constants closer to structure + impl->wip.subblocks.rolling = 
impl->wip.rolling.next; + impl->wip.subblocks.next = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next, + SKC_CMD_PATHS_COPY_TAG_SEGS); + impl->wip.subblocks.idx = 0; + + // increment rolling + skc_rolling_inc(impl); +} + +// +// +// + +static +void +skc_path_builder_impl_acquire_subblocks(struct skc_path_builder_impl * const impl, + skc_block_id_tag tag, + skc_uint vertices, + float * * subblocks) +{ + // + // FIRST TAG RECORDS THE ELEMENT TYPE + // + while (true) + { + // if only one block id left in node then acquire new node block + // and append its block id as with a next tag + if (impl->wip.ids.rem == 1) + skc_path_builder_impl_new_node_block(impl); + + // if zero subblocks left then acquire a new subblock block and + // append its block id + if (impl->wip.subblocks.rem == 0) + skc_path_builder_impl_new_segs_block(impl); + + // save first command -- tag and subblocks may have been updated + impl->wip.ids.next->u32 = skc_rolling_subblock(impl->wip.subblocks.rolling,impl->wip.subblocks.idx,tag); + + // increment node block subblock pointer + impl->wip.ids.next += 1; + impl->wip.ids.rem -= 1; + + // how many vertices can we store + skc_uint rem = min(vertices,impl->wip.subblocks.rem); + + // decrement vertices + vertices -= rem; + impl->wip.subblocks.rem -= rem; + impl->wip.subblocks.idx += rem; + + // assign subblocks + do { + *subblocks++ = impl->wip.subblocks.next; + impl->wip.subblocks.next += impl->runtime->config->subblock.words; + // FIXME -- move constants closer to structure + } while (--rem > 0); + + // anything left to do? + if (vertices == 0) + break; + + // any tag after this will be a caboose command + tag = SKC_BLOCK_ID_TAG_PATH_NEXT; + } +} + +// +// +// + +static +void +skc_path_builder_pfn_end(struct skc_path_builder_impl * const impl, skc_path_t * const path) +{ + // finalize incomplete active subblocks -- we don't care about any + // remaining unused subblocks in block + skc_path_builder_finalize_subblocks(impl->path_builder); + + // mark remaining wips.ids in the head or node as invalid + skc_path_builder_impl_finalize_node(impl); + + // flush node if rem > 0 and node is not actually head + if (impl->wip.head->header.nodes >= 1) + skc_path_builder_impl_flush_node(impl); + + // acquire path host id + *path = skc_runtime_handle_device_acquire(impl->runtime); // FIXME -- MAY WANT TO GRAB AN ID ON BEGIN + + // save path host handle + impl->wip.head->header.handle = *path; + + // flush head -- acquires a block and bumps head->header.blocks + skc_path_builder_impl_flush_head(impl); + + // get current release + struct skc_release_record * const release = skc_release_curr(impl); + + // acquire grid if null + if (release->grid == NULL) + { + release->grid = + SKC_GRID_DEPS_ATTACH(impl->runtime->deps, + &release->grid, // NULL on start/force + release, // data payload + skc_path_builder_grid_pfn_waiting, + NULL, // no execute pfn + skc_path_builder_grid_pfn_dispose); + } + + // update grid map + skc_grid_map(release->grid,*path); + + // update path release + impl->release.paths[release->to] = *path; + + // increment release.to + release->to = (release->to + 1) % impl->ring.blocks_per.buffer; + + // add guard bit + *path |= SKC_TYPED_HANDLE_TYPE_IS_PATH; + +#if 1 + // + // eager kernel launch? 
+ // + { + union skc_ringdex_expand const curr_from = skc_ringdex_expand(impl,impl->curr.from); + union skc_ringdex_expand const curr_to = skc_ringdex_expand(impl,impl->curr.to); + + if (curr_from.subbuf != curr_to.subbuf) + { + skc_grid_start(release->grid); + // skc_scheduler_yield(impl->runtime->scheduler); + } + } +#endif +} + +// +// FIXME -- clean up accessing of CONFIG constants in these 3 routines +// + +static +void +skc_path_builder_pfn_new_line(struct skc_path_builder_impl * const impl) +{ + // acquire subblock pointers + skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_LINE,4, + impl->path_builder->line.coords); + + // increment line count + impl->wip.head->header.prims += 1; + + // update rem_count_xxx count + impl->path_builder->line.rem = impl->runtime->config->subblock.words; +} + +static +void +skc_path_builder_pfn_new_quad(struct skc_path_builder_impl * const impl) +{ + // acquire subblock pointers + skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_QUAD,6, + impl->path_builder->quad.coords); + + // increment line count + impl->wip.head->header.prims += 1; + + // update rem_count_xxx count + impl->path_builder->quad.rem = impl->runtime->config->subblock.words; +} + +static +void +skc_path_builder_pfn_new_cubic(struct skc_path_builder_impl * const impl) +{ + // acquire subblock pointers + skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_CUBIC,8, + impl->path_builder->cubic.coords); + + // increment line count + impl->wip.head->header.prims += 1; + + // update rem_count_xxx count + impl->path_builder->cubic.rem = impl->runtime->config->subblock.words; +} + +// +// +// + +static +void +skc_path_builder_pfn_release(struct skc_path_builder_impl * const impl) +{ + // decrement reference count + if (--impl->path_builder->refcount != 0) + return; + + // + // otherwise, dispose of everything + // + struct skc_runtime * const runtime = impl->runtime; + + // free path builder + skc_runtime_host_perm_free(impl->runtime,impl->path_builder); + + // release cq + skc_runtime_release_cq_in_order(runtime,impl->cq); + + // release kernels + cl(ReleaseKernel(impl->kernels.alloc)); + cl(ReleaseKernel(impl->kernels.copy)); + + // free blocks extents + cl(ReleaseMemObject(impl->blocks.buffer)); + skc_runtime_host_perm_free(runtime,impl->blocks.subbufs); + + cl(ReleaseMemObject(impl->cmds.buffer)); + skc_runtime_host_perm_free(runtime,impl->cmds.subbufs); + + // free records + skc_runtime_host_perm_free(runtime,impl->release.records); + skc_runtime_host_perm_free(runtime,impl->release.paths); + + // release staging head and node + skc_runtime_host_perm_free(runtime,impl->wip.head); + skc_runtime_host_perm_free(runtime,impl->wip.node); + + // release reads scratch array + cl(ReleaseMemObject(impl->reads)); + + // for all subbuffers + // unmap subbuffer + // release subbuffer + // printf("%s not releasing subbuffers\n",__func__); + + skc_runtime_host_perm_free(impl->runtime,impl); +} + +// +// +// + +skc_err +skc_path_builder_cl_12_create(struct skc_context * const context, + struct skc_path_builder * * const path_builder) +{ + // + // retain the context + // skc_context_retain(context); + // + struct skc_runtime * const runtime = context->runtime; + + // allocate path builder + (*path_builder) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**path_builder)); + + // init state + SKC_ASSERT_STATE_INIT((*path_builder),SKC_PATH_BUILDER_STATE_READY); + + (*path_builder)->context = context; + + // save opaque impl-specific 
pointers + (*path_builder)->begin = skc_path_builder_pfn_begin; + (*path_builder)->end = skc_path_builder_pfn_end; + (*path_builder)->new_line = skc_path_builder_pfn_new_line; + (*path_builder)->new_quad = skc_path_builder_pfn_new_quad; + (*path_builder)->new_cubic = skc_path_builder_pfn_new_cubic; + (*path_builder)->release = skc_path_builder_pfn_release; + + // initialize path builder counts + (*path_builder)->line.rem = 0; + (*path_builder)->quad.rem = 0; + (*path_builder)->cubic.rem = 0; + + (*path_builder)->refcount = 1; + + struct skc_path_builder_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl)); + + (*path_builder)->impl = impl; + + // + // init impl + // + impl->path_builder = *path_builder; + impl->runtime = runtime; + + impl->cq = skc_runtime_acquire_cq_in_order(runtime); + + impl->kernels.alloc = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_ALLOC); + impl->kernels.copy = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_COPY); + + // + // FIXME -- let these config constants remain constant and in place + // + struct skc_config const * const config = runtime->config; + + impl->ring.subbufs = config->paths_copy.buffer.count; + impl->ring.blocks_per.buffer = config->paths_copy.subbuf.count * config->paths_copy.buffer.count; + impl->ring.blocks_per.subbuf = config->paths_copy.subbuf.count; + // + // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + // + + cl_int cl_err; + + // allocate large device-side extent for path data + impl->blocks.buffer = clCreateBuffer(runtime->cl.context, + CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, + config->paths_copy.block.buffer, // FIXME -- either use config or local constants everywhere + NULL,&cl_err); cl_ok(cl_err); + + // allocate small host-side array of pointers to mapped subbufs + impl->blocks.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE, + impl->ring.subbufs * + sizeof(*impl->blocks.subbufs)); + + // allocate large device-side extent for path copy commands + impl->cmds.buffer = clCreateBuffer(runtime->cl.context, + CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, + config->paths_copy.command.buffer, + NULL,&cl_err); cl_ok(cl_err); + + // allocate small host-side array of pointers to mapped subbufs + impl->cmds.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE, + impl->ring.subbufs * + sizeof(*impl->cmds.subbufs)); + + // allocate small host-side array of intervals of path handles + impl->release.records = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE, + impl->ring.subbufs * + sizeof(*impl->release.records)); + + // allocate large host-side array that is max # of path handles in flight + impl->release.paths = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE, + impl->ring.blocks_per.buffer * + sizeof(*impl->release.paths)); + + // small scratch used by kernels + impl->reads = clCreateBuffer(runtime->cl.context, + CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, + sizeof(skc_uint) * impl->ring.subbufs, + NULL,&cl_err); cl_ok(cl_err); + + // initialize release record with impl backpointer + for (skc_uint ii=0; ii<impl->ring.subbufs; ii++) + { + struct skc_release_record * record = impl->release.records + ii; + + record->impl = impl; + record->grid = NULL; + record->from = record->to = ii * impl->ring.blocks_per.subbuf; + } + + // + // allocate and map subbuffers -- we always check the command + // subbuffer's map/unmap events before touching it or its associated + // block subbuffer. 
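  // Both mappings below use CL_MAP_WRITE_INVALIDATE_REGION: the host only
  // writes fresh staging data into these regions (the subbuffers are
  // CL_MEM_HOST_WRITE_ONLY) and never reads them back, so the runtime can
  // skip transferring their previous contents.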
+ // + struct skc_subbuffer_blocks * sb = impl->blocks.subbufs; + struct skc_subbuffer_cmds * sc = impl->cmds .subbufs; + + cl_buffer_region rb = { 0, config->paths_copy.block.subbuf }; + cl_buffer_region rc = { 0, config->paths_copy.command.subbuf }; + + // for each subbuffer + for (skc_uint ii=0; ii<config->paths_copy.buffer.count; ii++) + { + sb->device = clCreateSubBuffer(impl->blocks.buffer, + CL_MEM_HOST_WRITE_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, + &rb, + &cl_err); cl_ok(cl_err); + + sb->host = clEnqueueMapBuffer(impl->cq, + sb->device, + CL_FALSE, + CL_MAP_WRITE_INVALIDATE_REGION, + 0,rb.size, + 0,NULL,NULL, + &cl_err); cl_ok(cl_err); + + sc->device = clCreateSubBuffer(impl->cmds.buffer, + CL_MEM_HOST_WRITE_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, + &rc, + &cl_err); cl_ok(cl_err); + + sc->host = clEnqueueMapBuffer(impl->cq, + sc->device, + CL_FALSE, + CL_MAP_WRITE_INVALIDATE_REGION, + 0,rc.size, + 0,NULL,&sc->map, + &cl_err); cl_ok(cl_err); + sb += 1; + sc += 1; + + rb.origin += rb.size; + rc.origin += rc.size; + } + + // + // initialize remaining members + // + impl->prev.from = 0; + impl->prev.to = 0; + impl->prev.rolling = 0; + + impl->curr.from = 0; + impl->curr.to = 0; + + impl->wip.to = 0; + + impl->wip.head = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes); + impl->wip.node = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes); + + impl->wip.rolling.one = SKC_BLOCK_ID_TAG_COUNT * config->block.subblocks; + impl->wip.rolling.next = 0; + + // for now, completely initialize builder before returning + cl(Finish(impl->cq)); + + return SKC_ERR_SUCCESS; +} + +// +// +// |
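For reference, the rolling/tag packing used by skc_rolling_block() and skc_rolling_subblock() above can be sketched as follows. The field widths are assumptions for illustration only; the real widths come from SKC_TAGGED_BLOCK_ID_BITS_TAG and the block-pool configuration defined elsewhere in the tree.

#include <stdint.h>
#include <stdio.h>

#define TAG_BITS      5u   /* assumed width of the tag field      */
#define SUBBLOCK_BITS 3u   /* assumed width of the subblock index */
#define TAG_MASK      ((1u << TAG_BITS) - 1u)

/* mirrors skc_rolling_subblock(): rolling | (subblock << tag bits) | tag */
static uint32_t
pack_subblock(uint32_t const rolling, uint32_t const subblock, uint32_t const tag)
{
  return rolling | (subblock << TAG_BITS) | tag;
}

int main(void)
{
  /* the rolling counter advances in strides that leave the tag and */
  /* subblock fields clear -- analogous to wip.rolling.one above    */
  uint32_t const one     = 1u << (TAG_BITS + SUBBLOCK_BITS);
  uint32_t const rolling = 4u * one;

  uint32_t const id = pack_subblock(rolling,3u,2u);

  printf("tag=%u subblock=%u rolling=%u\n",
         id & TAG_MASK,
         (id >> TAG_BITS) & ((1u << SUBBLOCK_BITS) - 1u),
         id >> (TAG_BITS + SUBBLOCK_BITS));

  return 0;
}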