/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <float.h>
#include <assert.h>
#include <stdbool.h>

#include "common/cl/assert_cl.h"

#include "context.h"
#include "handle.h"
#include "grid.h"
#include "path.h"
#include "path_builder.h"

#include "config_cl.h"
#include "export_cl_12.h"
#include "runtime_cl_12.h"
#include "path_builder_cl_12.h"

//
// OpenCL 1.2 devices support mapping of buffers into the host address
// space.
//
// Mapped buffers must be aligned on a MIN_DATA_TYPE_ALIGN_SIZE byte
// boundary (e.g. 128 bytes).  This complicates coordinating sharing
// of data between the host and the device.
//
// Some OpenCL 2.0 devices support fine-grained shared virtual memory
// pointers with byte-addressing and allow simpler coordination
// strategies at the cost of maintaining cache coherency.
//
// The path builder is focused on moving bulk path data from the host
// into the device-managed "block" memory pool and arranging it into a
// SIMT/SIMD-friendly data structure that can be efficiently read by
// the rasterizer.
//
// Note that one simplifying assumption is that the maximum length of
// a *single* path can't be larger than what fits in the single extent
// (which is split into M subbuffers).  This would be a very long path
// and a legitimate size limitation.
//
// For some systems, it may be appropriate to never pull path data
// into the device-managed block pool and instead present the path
// data to the device in a temporarily available allocated memory
// "zone" of paths that can be discarded all at once.
//
// For other systems, it may be appropriate to simply copy the path
// data from host to device.
//
// But the majority of OpenCL (and VK, MTL, DX12) devices we'll be
// targeting support basic map/unmap functionality similar to OpenCL
// 1.2.  Furthermore, not all OpenCL 2.0 devices support fine-grained
// sharing of memory and still require a map/unmap step... but note
// that they all support byte-aligned mapping and subbuffers.
//
// The general strategy that this particular CL_12 implementation uses
// is to allocate a large mappable bulk-data path buffer and an
// auxiliary mappable command buffer.
//
// The buffers are split into a reasonable number of properly aligned
// subbuffers to enable simultaneous host and device access.
//
// Blocks:
//   1 extent
//   M mapped subbuffers (configurable) to allow for concurrency
//
// Commands:
//   1 extent
//   M mapped subbuffers (configurable) to allow for concurrency
//
// Spans:
//   M hi/lo structures
//
// { cl_sub, void*, event, base }
//
//   - size of sub buffer
//   - remaining
//   - counts
//
// For any kernel launch, at most one path will be discontiguous and
// defined across two sub-buffers.
//
// Nodes are updated locally until full and then stored so they will
// never be incomplete.  Headers are stored locally until the path is
// ended so they will never be incomplete.
//
// A line, quad or cubic acquires 4/6/8 segments which may be spread
// across one or more contiguous blocks.
//
// If a flush() occurs then the remaining columns of multi-segment
// paths are initialized with zero-length line, quad and cubic
// elements.
//
// Every block's command word has a type and a count acquired from a
// rolling counter.
//
// The kernel is passed two spans of blocks { base, count } to
// process.  The grid must process (lo.count + hi.count) blocks.
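//
// Illustrative sketch (not compiled): how a ringdex maps onto the
// subbuffer ring.  The subbuffer and block counts below are
// assumptions for the example only -- the real values come from the
// runtime config (config->paths_copy.*).
//
#if 0
static
void
skc_ringdex_example(void)
{
  skc_uint const blocks_per_subbuf = 256;  // example value
  skc_uint const subbufs           = 4;    // example value
  skc_uint const blocks_per_buffer = subbufs * blocks_per_subbuf; // 1024

  skc_uint const examples[] = { 0, 255, 256, 1023 };

  // a ringdex in [0,1024) expands into (subbuf,block) by division:
  //   0 -> (0,0) / 255 -> (0,255) / 256 -> (1,0) / 1023 -> (3,255)
  for (skc_uint ii=0; ii<4; ii++)
    {
      div_t const qr = div((int)examples[ii],(int)blocks_per_subbuf);

      printf("ringdex %4u -> subbuf %d, block %d\n",examples[ii],qr.quot,qr.rem);
    }
}
#endif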
//

struct skc_subbuffer_blocks
{
  cl_mem   device;
  void *   host;
};

struct skc_subbuffer_cmds
{
  cl_mem   device;
  void *   host;
  cl_event map;
};

//
// ringdex is an index with range [0, blocks-per-subbuf * subbufs-per-buffer )
//

typedef skc_uint skc_ringdex_t;

union skc_ringdex_expand
{
  div_t qr;

  struct {
#ifndef SKC_DIV_REM_BEFORE_QUOT // offsetof(div_t,quot) != 0
    skc_uint subbuf;
    skc_uint block;
#else
    skc_uint block;
    skc_uint subbuf;
#endif
  };
};

//
// this record is executed by the grid
//

struct skc_release_record
{
  struct skc_path_builder_impl * impl; // back pointer to impl

  skc_grid_t                     grid; // pointer to scheduled grid

  skc_uint                       from; // inclusive starting index  : [from,to)
  skc_uint                       to;   // non-inclusive ending index: [from,to)
};

//
//
//

struct skc_path_builder_impl
{
  struct skc_path_builder * path_builder;

  struct skc_runtime      * runtime;

  cl_command_queue          cq;

  struct {
    cl_kernel               alloc;
    cl_kernel               copy;
  } kernels;

  //
  // FIXME -- make this pointer to constant config
  //
  // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
  struct {
    skc_uint                subbufs;  // how many subbufs in the buffer?

    struct {
      skc_uint              buffer;   // how many blocks in the buffer?
      skc_uint              subbuf;   // how many blocks in a subbuf?
    } blocks_per;
  } ring;
  //
  // ^^^^^^^^^^^ don't duplicate these constants ^^^^^^^^^^^^^^^^^^
  //

  struct {
    cl_mem                        buffer;  // backing buffer for blocks
    struct skc_subbuffer_blocks * subbufs; // array of structures
  } blocks;

  struct {
    cl_mem                        buffer;  // backing buffer for commands
    struct skc_subbuffer_cmds   * subbufs; // array of structures
  } cmds;

  struct {
    struct skc_release_record   * records; // max release records is equal to max subbufs
    skc_path_t                  * paths;   // max paths is less than or equal to max commands
  } release;

  cl_mem reads; // each kernel only requires one word to store the block pool "base"

  struct {
    skc_uint      rolling; // rolling counter used by cmds to map to block pool alloc
    skc_ringdex_t from;
    skc_ringdex_t to;
  } prev;

  struct {
    skc_ringdex_t from;
    skc_ringdex_t to;
  } curr;

  struct {
    struct skc_path_head * head; // pointer to local path header -- not written until path end
    struct skc_path_node * node; // pointer to local node -- may alias head until head is full

    struct {
      skc_uint                    rolling; // rolling counter of wip node -- valid after one node is allocated
      union skc_tagged_block_id * next;    // next slot in node -- may initially point to head.ids
      skc_uint                    rem;     // how many id slots left in node block
    } ids;

    struct {
      skc_uint rem;     // how many subblocks left in block?
      skc_uint rolling; // rolling counter of block of subblocks
      float  * next;    // next subblock in current subblock block
      skc_uint idx;     // index of next subblock
    } subblocks;

    struct {
      skc_uint one;  // .block = 1
      skc_uint next; // rolling counter used by cmds to map to block pool alloc
    } rolling;

    skc_ringdex_t to; // ringdex of _next_available_ command/block in ring -- FIXME -- should be current
  } wip;
};

//
// FIXME -- move to a pow2 subbuffer size and dispense with division
// and modulo operations
//

static
union skc_ringdex_expand
skc_ringdex_expand(struct skc_path_builder_impl * const impl,
                   skc_ringdex_t                  const ringdex)
{
  return (union skc_ringdex_expand){ .qr = div(ringdex,impl->ring.blocks_per.subbuf) };
}
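//
// A minimal sketch of the pow2 variant suggested by the FIXME above:
// if blocks_per.subbuf were constrained to a power of two, the div()
// would become a shift and a mask.  The 'blocks_per_subbuf_log2' and
// 'blocks_per_subbuf_mask' fields are hypothetical -- they would need
// to be added to impl->ring at creation time.
//
#if 0
static
union skc_ringdex_expand
skc_ringdex_expand_pow2(struct skc_path_builder_impl * const impl,
                        skc_ringdex_t                  const ringdex)
{
  return (union skc_ringdex_expand){
    .subbuf = ringdex >> impl->ring.blocks_per_subbuf_log2, // hypothetical field
    .block  = ringdex &  impl->ring.blocks_per_subbuf_mask  // hypothetical field
  };
}
#endif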
static
void
skc_ringdex_wip_to_block_inc(struct skc_path_builder_impl * const impl)
{
  //
  // FIXME - which is faster?
  //
#if 1
  impl->wip.to = (impl->wip.to + 1) % impl->ring.blocks_per.buffer;
#else
  impl->wip.to -= (impl->wip.to + 1 < impl->ring.blocks_per.buffer) ? -1 : impl->wip.to;
#endif

  // this path is too long -- for now assert() and die
  assert(impl->wip.to != impl->curr.from);
}

static
skc_ringdex_t
skc_ringdex_span(struct skc_path_builder_impl * const impl,
                 skc_ringdex_t                  const from,
                 skc_ringdex_t                  const to)
{
  return (to - from) % impl->ring.blocks_per.buffer;
}

static
void
skc_ringdex_wip_to_subbuf_inc(struct skc_path_builder_impl * const impl)
{
  union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to);

  // nothing to do if this is the first block in the subbuf
  if (to.block == 0)
    return;

  // otherwise increment and mod
  skc_uint const new_subbuf = (to.subbuf + 1) % impl->ring.subbufs;

  impl->wip.to = new_subbuf * impl->ring.blocks_per.subbuf;
}

static
skc_bool
skc_ringdex_curr_is_equal(struct skc_path_builder_impl * const impl)
{
  return impl->curr.from == impl->curr.to;
}

static
skc_bool
skc_ringdex_prev_is_equal(struct skc_path_builder_impl * const impl)
{
  return impl->prev.from == impl->prev.to;
}

static
skc_uint
skc_ringdex_dont_map_last(struct skc_path_builder_impl * const impl,
                          skc_uint                       const to_block)
{
  // no blocks acquired OR this is the last block in the subbuf
  return !((impl->wip.to == impl->curr.to) || (to_block == 0));
}

//
//
//

static
struct skc_release_record *
skc_release_curr(struct skc_path_builder_impl * const impl)
{
  union skc_ringdex_expand curr_from = skc_ringdex_expand(impl,impl->curr.from);

  return impl->release.records + curr_from.subbuf;
}

//
// FIXME -- get rid of all distant config references -- grab them all
// at creation time
//

static
void
skc_path_builder_pfn_begin(struct skc_path_builder_impl * const impl)
{
  // init header counters
  // { handle, blocks, nodes, prims }
  impl->wip.head->header = (union skc_path_header){
    .handle = 0,
    .blocks = 0,
    .nodes  = 0,
    .prims  = 0
  };

  // FIXME -- BOUNDS SHOULD USE SIMD4 TRICK AND NEGATE ONE OF THE CORNERS
  impl->wip.head->bounds = (union skc_path_bounds){ +FLT_MIN, +FLT_MIN, -FLT_MIN, -FLT_MIN };

  // point wip ids at local head node
  impl->wip.ids.next = impl->wip.head->tag_ids; // point to local head node
  impl->wip.ids.rem  = impl->runtime->config->block.words - SKC_PATH_HEAD_WORDS; // FIXME -- save this constant somewhere

  // start with no subblocks
  impl->wip.subblocks.rem = 0;
}

//
//
//

static
void
skc_path_builder_impl_finalize_node(struct skc_path_builder_impl * const impl)
{
#if 1
  //
  // FIXME -- a Duff's device might be optimal here but would have to
  // be customized per device since nodes could be 16-128+ words
  //
  while (impl->wip.ids.rem > 0)
    {
      impl->wip.ids.rem      -= 1;
      impl->wip.ids.next->u32 = SKC_TAGGED_BLOCK_ID_INVALID;
      impl->wip.ids.next     += 1;
    }
#else
  memset(&impl->wip.ids.next->u32,
         SKC_TAGGED_BLOCK_ID_INVALID, // 0xFF
         sizeof(impl->wip.ids.next->u32) * impl->wip.ids.rem);

  impl->wip.ids.next += impl->wip.ids.rem;
  impl->wip.ids.rem   = 0;
#endif
}
//
//
//

static
void
skc_zero_float(skc_float * p, skc_uint rem)
{
  memset(p,0,sizeof(*p)*rem);
}

static
void
skc_path_builder_finalize_subblocks(struct skc_path_builder * const path_builder)
{
  //
  // FIXME -- it might be more performant to zero the remaining
  // columns in a subblock -- a subblock at a time -- instead of the
  // same column across all the subblocks
  //
#if 0
  while (path_builder->line.rem > 0)
    {
      --path_builder->line.rem;

      *path_builder->line.coords[0]++ = 0.0f;
      *path_builder->line.coords[1]++ = 0.0f;
      *path_builder->line.coords[2]++ = 0.0f;
      *path_builder->line.coords[3]++ = 0.0f;
    }

  while (path_builder->quad.rem > 0)
    {
      --path_builder->quad.rem;

      *path_builder->quad.coords[0]++ = 0.0f;
      *path_builder->quad.coords[1]++ = 0.0f;
      *path_builder->quad.coords[2]++ = 0.0f;
      *path_builder->quad.coords[3]++ = 0.0f;
      *path_builder->quad.coords[4]++ = 0.0f;
      *path_builder->quad.coords[5]++ = 0.0f;
    }

  while (path_builder->cubic.rem > 0)
    {
      --path_builder->cubic.rem;

      *path_builder->cubic.coords[0]++ = 0.0f;
      *path_builder->cubic.coords[1]++ = 0.0f;
      *path_builder->cubic.coords[2]++ = 0.0f;
      *path_builder->cubic.coords[3]++ = 0.0f;
      *path_builder->cubic.coords[4]++ = 0.0f;
      *path_builder->cubic.coords[5]++ = 0.0f;
      *path_builder->cubic.coords[6]++ = 0.0f;
      *path_builder->cubic.coords[7]++ = 0.0f;
    }
#else
  if (path_builder->line.rem > 0)
    {
      skc_zero_float(path_builder->line.coords[0],path_builder->line.rem);
      skc_zero_float(path_builder->line.coords[1],path_builder->line.rem);
      skc_zero_float(path_builder->line.coords[2],path_builder->line.rem);
      skc_zero_float(path_builder->line.coords[3],path_builder->line.rem);

      path_builder->line.rem = 0;
    }

  if (path_builder->quad.rem > 0)
    {
      skc_zero_float(path_builder->quad.coords[0],path_builder->quad.rem);
      skc_zero_float(path_builder->quad.coords[1],path_builder->quad.rem);
      skc_zero_float(path_builder->quad.coords[2],path_builder->quad.rem);
      skc_zero_float(path_builder->quad.coords[3],path_builder->quad.rem);
      skc_zero_float(path_builder->quad.coords[4],path_builder->quad.rem);
      skc_zero_float(path_builder->quad.coords[5],path_builder->quad.rem);

      path_builder->quad.rem = 0;
    }

  if (path_builder->cubic.rem > 0)
    {
      skc_zero_float(path_builder->cubic.coords[0],path_builder->cubic.rem);
      skc_zero_float(path_builder->cubic.coords[1],path_builder->cubic.rem);
      skc_zero_float(path_builder->cubic.coords[2],path_builder->cubic.rem);
      skc_zero_float(path_builder->cubic.coords[3],path_builder->cubic.rem);
      skc_zero_float(path_builder->cubic.coords[4],path_builder->cubic.rem);
      skc_zero_float(path_builder->cubic.coords[5],path_builder->cubic.rem);
      skc_zero_float(path_builder->cubic.coords[6],path_builder->cubic.rem);
      skc_zero_float(path_builder->cubic.coords[7],path_builder->cubic.rem);

      path_builder->cubic.rem = 0;
    }
#endif
}
//
//
//

static
void
skc_path_builder_impl_unmap(struct skc_path_builder_impl * const impl,
                            skc_uint                             from,
                            skc_uint                             to)
{
  // to might be out of range
  to = to % impl->ring.subbufs;

#if 0
  fprintf(stderr,"unmap: [%2u,%2u)\n",from,to);
#endif

  while (from != to)
    {
      // bring 'from' back in range
      from = from % impl->ring.subbufs;

      struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from;
      struct skc_subbuffer_cmds   * const cmds   = impl->cmds  .subbufs + from;

      cl(EnqueueUnmapMemObject(impl->cq,
                               blocks->device,
                               blocks->host,
                               0,NULL,NULL));

      cl(EnqueueUnmapMemObject(impl->cq,
                               cmds->device,
                               cmds->host,
                               0,NULL,NULL));

      // bring 'from' back in range
      from = (from + 1) % impl->ring.subbufs;
    }
}

//
// FIXME -- reuse this in create()
//

static
void
skc_path_builder_impl_map(struct skc_path_builder_impl * const impl,
                          skc_uint                             from,
                          skc_uint                             to)
{
  // to might be out of range
  to = to % impl->ring.subbufs;

#if 0
  fprintf(stderr,"  map: [%2u,%2u)\n",from,to);
#endif

  while (from != to)
    {
      cl_int cl_err;

      struct skc_subbuffer_blocks * const blocks = impl->blocks.subbufs + from;
      struct skc_subbuffer_cmds   * const cmds   = impl->cmds  .subbufs + from;

      blocks->host = clEnqueueMapBuffer(impl->cq,
                                        blocks->device,
                                        CL_FALSE,
                                        CL_MAP_WRITE_INVALIDATE_REGION,
                                        0,impl->runtime->config->paths_copy.block.subbuf,
                                        0,NULL,NULL,
                                        &cl_err); cl_ok(cl_err);

      cl(ReleaseEvent(cmds->map));

      cmds->host = clEnqueueMapBuffer(impl->cq,
                                      cmds->device,
                                      CL_FALSE,
                                      CL_MAP_WRITE_INVALIDATE_REGION,
                                      0,impl->runtime->config->paths_copy.command.subbuf,
                                      0,NULL,&cmds->map,
                                      &cl_err); cl_ok(cl_err);

      // bring 'from' back in range
      from = (from + 1) % impl->ring.subbufs;
    }

  //
  // FIXME -- when we switch to out-of-order queues we'll need a
  // barrier here
  //
}
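//
// A sketch of the barrier mentioned in the FIXME above, for a
// hypothetical switch to an out-of-order command queue.  OpenCL 1.2's
// clEnqueueBarrierWithWaitList() would order the preceding map
// commands ahead of any subsequently enqueued kernels.
//
#if 0
static
void
skc_path_builder_impl_map_barrier(struct skc_path_builder_impl * const impl)
{
  // on an out-of-order queue, force all prior commands to complete
  // before later commands begin executing
  cl(EnqueueBarrierWithWaitList(impl->cq,0,NULL,NULL));
}
#endif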
//
//
//

static
void
skc_path_builder_release_dispose(struct skc_release_record    * const release,
                                 struct skc_path_builder_impl * const impl)
{
  struct skc_runtime * runtime = impl->runtime;

  if (release->from <= release->to) // no wrap
    {
      skc_path_t const * paths = impl->release.paths + release->from;
      skc_uint           count = release->to - release->from;

      skc_grid_deps_unmap(runtime->deps,paths,count);
      skc_runtime_path_device_release(runtime,paths,count);
    }
  else // from > to implies wrap
    {
      skc_path_t const * paths_lo = impl->release.paths + release->from;
      skc_uint           count_lo = impl->ring.blocks_per.buffer - release->from;

      skc_grid_deps_unmap(runtime->deps,paths_lo,count_lo);
      skc_runtime_path_device_release(runtime,paths_lo,count_lo);

      skc_grid_deps_unmap(runtime->deps,impl->release.paths,release->to);
      skc_runtime_path_device_release(runtime,impl->release.paths,release->to);
    }

  release->to = release->from;
}

static
void
skc_path_builder_grid_pfn_dispose(skc_grid_t const grid)
{
  struct skc_release_record    * const release = skc_grid_get_data(grid);
  struct skc_path_builder_impl * const impl    = release->impl;

  skc_path_builder_release_dispose(release,impl);
}

static
void
// skc_path_builder_complete(struct skc_release_record * const release)
skc_path_builder_complete(skc_grid_t grid)
{
  //
  // notify deps that this grid is complete enough for other grids to
  // proceed
  //
  // the path builder still has some cleanup to do before all its
  // resources can be reused
  //
  skc_grid_complete(grid);
}

static
void
skc_path_builder_paths_copy_cb(cl_event event, cl_int status, skc_grid_t grid)
{
  SKC_CL_CB(status);

  struct skc_release_record * const release = skc_grid_get_data(grid);

  SKC_SCHEDULER_SCHEDULE(release->impl->runtime->scheduler,skc_path_builder_complete,grid);
}
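//
// Illustrative sketch (not compiled): how the two block spans handed
// to the paths-copy kernel in skc_path_builder_grid_pfn_waiting()
// below are computed.  The ring size is an assumption for the example
// only.
//
#if 0
static
void
skc_span_example(void)
{
  skc_uint const blocks_per_buffer = 1024;        // example ring size

  // a span wraps modulo the ring size: (to - from) % blocks
  skc_uint const prev_from = 1000, prev_to = 8;   // wraps around the ring
  skc_uint const curr_from = 8,    curr_to = 40;  // no wrap

  skc_uint const pb_prev_span = (prev_to - prev_from) % blocks_per_buffer; // 32
  skc_uint const pb_curr_span = (curr_to - curr_from) % blocks_per_buffer; // 32

  // the grid must process (lo.count + hi.count) blocks
  printf("prev %u + curr %u = %u blocks\n",pb_prev_span,pb_curr_span,pb_prev_span + pb_curr_span);
}
#endif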
//
//
//

static
void
skc_path_builder_grid_pfn_waiting(skc_grid_t const grid)
{
  struct skc_release_record    * const release = skc_grid_get_data(grid);
  struct skc_path_builder_impl * const impl    = release->impl;

  // 1. flush incomplete subblocks of path elements
  // 2. unmap subbuffer on cq.unmap
  // 3. flush cq.unmap
  // 4. launch kernel on cq.kernel but wait for unmap completion
  // 5. flush cq.kernel
  // 6. remap relevant subbuffers on cq.map but wait for kernel completion
  // 7. flush cq.map

  //
  // FIXME -- can be smarter about flushing if the wip paths are not
  // in the same subbuf as curr.to
  //
  // THIS IS IMPORTANT TO FIX
  //

  // flush incomplete subblocks
  skc_path_builder_finalize_subblocks(impl->path_builder);

  //
  // get range of subbufs that need to be unmapped
  //
  // note that impl->prev subbufs have already been unmapped
  //
  union skc_ringdex_expand curr_from  = skc_ringdex_expand(impl,impl->curr.from);
  union skc_ringdex_expand curr_to    = skc_ringdex_expand(impl,impl->curr.to);
  skc_uint           const is_partial = curr_to.block > 0;
  skc_uint           const unmap_to   = curr_to.subbuf + is_partial;

  //
  // unmap all subbufs in range [from,to)
  //
  skc_path_builder_impl_unmap(impl,curr_from.subbuf,unmap_to);

  //
  // launch kernels
  //
  skc_uint const pb_prev_span = skc_ringdex_span(impl,impl->prev.from,impl->prev.to);
  skc_uint const pb_curr_span = skc_ringdex_span(impl,impl->curr.from,impl->curr.to);
  skc_uint const pb_cmds      = pb_prev_span + pb_curr_span;

  //
  // 1) allocate blocks from pool
  //

  //
  // FIXME -- pack integers into struct/vector
  //
  cl(SetKernelArg(impl->kernels.alloc,0,SKC_CL_ARG(impl->runtime->block_pool.atomics.drw)));
  cl(SetKernelArg(impl->kernels.alloc,1,SKC_CL_ARG(impl->reads)));
  cl(SetKernelArg(impl->kernels.alloc,2,SKC_CL_ARG(curr_from.subbuf)));
  cl(SetKernelArg(impl->kernels.alloc,3,SKC_CL_ARG(pb_cmds)));

  skc_device_enqueue_kernel(impl->runtime->device,
                            SKC_DEVICE_KERNEL_ID_PATHS_ALLOC,
                            impl->cq,
                            impl->kernels.alloc,
                            1,
                            0,NULL,NULL);

  //
  // 2) copy blocks from unmapped device-accessible memory
  //

  //
  // FIXME -- pack integers into struct/vector and reduce 13 arguments down to 7
  //
  cl(SetKernelArg(impl->kernels.copy, 0,SKC_CL_ARG(impl->runtime->handle_pool.map.drw)));
  cl(SetKernelArg(impl->kernels.copy, 1,SKC_CL_ARG(impl->runtime->block_pool.ids.drw)));
  cl(SetKernelArg(impl->kernels.copy, 2,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw)));
  cl(SetKernelArg(impl->kernels.copy, 3,SKC_CL_ARG(impl->runtime->block_pool.size->ring_mask)));
  cl(SetKernelArg(impl->kernels.copy, 4,SKC_CL_ARG(impl->reads)));
  cl(SetKernelArg(impl->kernels.copy, 5,SKC_CL_ARG(curr_from.subbuf)));
  cl(SetKernelArg(impl->kernels.copy, 6,SKC_CL_ARG(impl->cmds.buffer)));
  cl(SetKernelArg(impl->kernels.copy, 7,SKC_CL_ARG(impl->blocks.buffer)));
  cl(SetKernelArg(impl->kernels.copy, 8,SKC_CL_ARG(impl->ring.blocks_per.buffer)));
  cl(SetKernelArg(impl->kernels.copy, 9,SKC_CL_ARG(impl->prev.rolling)));
  cl(SetKernelArg(impl->kernels.copy,10,SKC_CL_ARG(impl->prev.from)));
  cl(SetKernelArg(impl->kernels.copy,11,SKC_CL_ARG(pb_prev_span)));
  cl(SetKernelArg(impl->kernels.copy,12,SKC_CL_ARG(impl->curr.from)));

  cl_event complete;

  skc_device_enqueue_kernel(impl->runtime->device,
                            SKC_DEVICE_KERNEL_ID_PATHS_COPY,
                            impl->cq,
                            impl->kernels.copy,
                            pb_cmds,
                            0,NULL,&complete);

  // set a callback on completion
  cl(SetEventCallback(complete,CL_COMPLETE,
                      skc_path_builder_paths_copy_cb,
                      grid));

  // immediately release
  cl(ReleaseEvent(complete));

  //
  // remap as many subbuffers as possible after the kernel completes
  //
  // note that remaps are async and enqueued on the same command queue
  // as the kernel launch
  //
  // we can't remap subbuffers that are in the possibly empty range
  //
  // cases:
  //
  //   - curr.to == wip.to which means no blocks have been acquired
  //   - curr.to points to first block in (next) subbuf
  //   - otherwise, wip acquired blocks in the curr.to subbuf
  //
  // check for these first 2 cases!
  //
  union skc_ringdex_expand const prev_from = skc_ringdex_expand(impl,impl->prev.from);
  skc_uint                 const no_wip    = impl->curr.to == impl->wip.to;
  skc_uint                       map_to    = curr_to.subbuf + (is_partial && no_wip);

  // remap all subbufs in range [from,to)
  skc_path_builder_impl_map(impl,prev_from.subbuf,map_to);

  // flush command queue
  cl(Flush(impl->cq));

  // save rolling
  impl->prev.rolling = impl->wip.rolling.next;

  // update prev and curr
  if (no_wip)
    {
      //
      // if there was no wip then round up to the next subbuf
      //
      skc_ringdex_wip_to_subbuf_inc(impl);

      //
      // update prev/curr with the incremented wip
      //
      impl->prev.from = impl->prev.to = impl->wip.to;
      impl->curr.from = impl->curr.to = impl->wip.to;
    }
  else
    {
      //
      // update prev with wip partials
      //
      impl->prev.from = impl->curr.to;
      impl->prev.to   = impl->wip .to;

      //
      // start curr on a new subbuf boundary
      //
      skc_ringdex_wip_to_subbuf_inc(impl);

      impl->curr.from = impl->wip.to;
      impl->curr.to   = impl->wip.to;
    }
}

//
//
//

static
void
skc_path_builder_impl_acquire_subbuffer(struct skc_path_builder_impl * const impl,
                                        skc_uint                       const subbuf)
{
  //
  // FIXME -- move to a power-of-two subbuf size and kickstart path
  // copies as early as possible
  //
  // FIXME -- the subbufs "self-clock" (flow control) the kernel
  // launches and accounting.  Combine all the subbuffers and release
  // records into a single indexable struct instead of 3.
  //
  struct skc_subbuffer_cmds * const sc        = impl->cmds.subbufs    + subbuf;
  struct skc_release_record * const release   = impl->release.records + subbuf;
  struct skc_scheduler      * const scheduler = impl->runtime->scheduler;

  // can't proceed until the paths have been released
  SKC_SCHEDULER_WAIT_WHILE(scheduler,release->from != release->to);

  // throw in a scheduler yield ... FIXME -- get rid of
  skc_scheduler_yield(scheduler);

  // can't proceed until the subbuffer is mapped
  cl(WaitForEvents(1,&sc->map));
}
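//
// A sketch of the consolidation suggested by the FIXME in
// skc_path_builder_impl_acquire_subbuffer() above: the three parallel
// per-subbuf arrays could collapse into one indexable struct.
// Hypothetical type -- not part of the build.
//
#if 0
struct skc_subbuffer
{
  struct skc_subbuffer_blocks blocks;  // mapped block storage
  struct skc_subbuffer_cmds   cmds;    // mapped command storage + map event
  struct skc_release_record   release; // path release interval for this subbuf
};
#endif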
//
//
//

static
union skc_ringdex_expand
skc_path_builder_impl_acquire_block(struct skc_path_builder_impl * const impl)
{
  // break ringdex into components
  union skc_ringdex_expand const to = skc_ringdex_expand(impl,impl->wip.to);

  // does the wip ringdex point to a new subbuffer?
  if (to.block == 0)
    {
      // potentially spin/block waiting for subbuffer
      skc_path_builder_impl_acquire_subbuffer(impl,to.subbuf);
    }

  // post-increment wip.to
  skc_ringdex_wip_to_block_inc(impl);

  return to;
}

//
//
//

static
skc_uint
skc_rolling_block(skc_uint const rolling, skc_uint const tag)
{
  return rolling | tag;
}

static
skc_uint
skc_rolling_subblock(skc_uint const rolling, skc_uint const subblock, skc_uint const tag)
{
  return rolling | (subblock << SKC_TAGGED_BLOCK_ID_BITS_TAG) | tag;
}

static
void
skc_rolling_inc(struct skc_path_builder_impl * const impl)
{
  impl->wip.rolling.next += impl->wip.rolling.one;
}

//
//
//

static
void *
skc_path_builder_impl_new_command(struct skc_path_builder_impl * const impl,
                                  skc_uint                       const rolling,
                                  skc_cmd_paths_copy_tag         const tag)
{
  // bump blocks count
  impl->wip.head->header.blocks += 1;

  // acquire a block
  union skc_ringdex_expand const to = skc_path_builder_impl_acquire_block(impl);

  // make a pointer
  union skc_tagged_block_id * const cmds_subbuf = impl->cmds.subbufs[to.subbuf].host;

  // store command for block
  cmds_subbuf[to.block].u32 = skc_rolling_block(rolling,tag);

#if 0
  // store command for block
  cmds_subbuf[to.block].u32 = skc_rolling_block(impl->wip.rolling.next,tag);

  // increment rolling
  skc_rolling_inc(impl);
#endif

  // return pointer to block
  float * const blocks_subbuf = impl->blocks.subbufs[to.subbuf].host;

  // FIXME -- make it easier to get config constant
  return blocks_subbuf + (to.block * impl->runtime->config->block.words);
}

//
//
//

static
void
skc_path_builder_impl_flush_node(struct skc_path_builder_impl * const impl)
{
  // store command to subbuf and get pointer to blocks subbuf
  void * const block = skc_path_builder_impl_new_command(impl,impl->wip.ids.rolling,
                                                         SKC_CMD_PATHS_COPY_TAG_NODE);

  // copy node to blocks subbuf -- write-only
  memcpy(block,impl->wip.node,impl->runtime->config->block.bytes);
}

static
void
skc_path_builder_impl_flush_head(struct skc_path_builder_impl * const impl)
{
  // store command to subbuf and get pointer to blocks subbuf
  void * const block = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next,
                                                         SKC_CMD_PATHS_COPY_TAG_HEAD);

  // copy head to blocks subbuf -- write-only
  memcpy(block,impl->wip.head,impl->runtime->config->block.bytes);

  // increment rolling
  skc_rolling_inc(impl);

  // the 'to' index is non-inclusive so assign wip.to after flush_head
  impl->curr.to = impl->wip.to;
}

//
//
//

static
void
skc_path_builder_impl_new_node_block(struct skc_path_builder_impl * const impl)
{
  // update final block id in node
  impl->wip.ids.next->u32 = skc_rolling_block(impl->wip.rolling.next,SKC_BLOCK_ID_TAG_PATH_NEXT);

  // if wip.ids is not the header then flush the now-full wip node
  if (impl->wip.head->header.nodes > 0)
    skc_path_builder_impl_flush_node(impl);

  // bump node count
  impl->wip.head->header.nodes += 1;

  // save current rolling
  impl->wip.ids.rolling = impl->wip.rolling.next;

  // increment rolling
  skc_rolling_inc(impl);

  // update wip.ids.*
  impl->wip.ids.next = impl->wip.node->tag_ids;
  impl->wip.ids.rem  = impl->runtime->config->block.words;
}

static
void
skc_path_builder_impl_new_segs_block(struct skc_path_builder_impl * const impl)
{
  impl->wip.subblocks.rem     = impl->runtime->config->block.subblocks; // FIXME -- move constants closer to structure
  impl->wip.subblocks.rolling = impl->wip.rolling.next;
  impl->wip.subblocks.next    = skc_path_builder_impl_new_command(impl,impl->wip.rolling.next,
                                                                  SKC_CMD_PATHS_COPY_TAG_SEGS);
  impl->wip.subblocks.idx     = 0;

  // increment rolling
  skc_rolling_inc(impl);
}
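//
// Illustrative sketch (not compiled): how skc_rolling_subblock()
// packs a tagged block id.  The bit width and counts below are
// assumptions for the example only -- the real values are
// SKC_TAGGED_BLOCK_ID_BITS_TAG and the config's subblocks-per-block
// -- and we assume SKC_BLOCK_ID_TAG_COUNT == (1 << bits_tag) so the
// rolling counter's stride leaves the low bits clear.
//
#if 0
static
void
skc_tagged_block_id_example(void)
{
  skc_uint const bits_tag  = 5;                     // assumed tag field width
  skc_uint const subblocks = 8;                     // assumed subblocks per block
  skc_uint const one       = subblocks << bits_tag; // rolling counter stride

  skc_uint const rolling   = 3 * one;               // counter after three increments
  skc_uint const subblock  = 5;                     // subblock index within the block
  skc_uint const tag       = 1;                     // assumed element tag

  // the three fields never collide because the counter advances in
  // strides that leave the subblock and tag bits zero
  skc_uint const id = rolling | (subblock << bits_tag) | tag;

  printf("tagged block id = %08X\n",id);
}
#endif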
//
//
//

static
void
skc_path_builder_impl_acquire_subblocks(struct skc_path_builder_impl * const impl,
                                        skc_block_id_tag                     tag,
                                        skc_uint                             vertices,
                                        float * *                            subblocks)
{
  //
  // THE FIRST TAG RECORDS THE ELEMENT TYPE
  //
  while (true)
    {
      // if only one block id is left in the node then acquire a new
      // node block and append its block id with a 'next' tag
      if (impl->wip.ids.rem == 1)
        skc_path_builder_impl_new_node_block(impl);

      // if zero subblocks are left then acquire a new subblock block
      // and append its block id
      if (impl->wip.subblocks.rem == 0)
        skc_path_builder_impl_new_segs_block(impl);

      // save first command -- tag and subblocks may have been updated
      impl->wip.ids.next->u32 = skc_rolling_subblock(impl->wip.subblocks.rolling,impl->wip.subblocks.idx,tag);

      // increment node block subblock pointer
      impl->wip.ids.next += 1;
      impl->wip.ids.rem  -= 1;

      // how many vertices can we store?
      skc_uint rem = min(vertices,impl->wip.subblocks.rem);

      // decrement vertices
      vertices                -= rem;
      impl->wip.subblocks.rem -= rem;
      impl->wip.subblocks.idx += rem;

      // assign subblocks
      do {
        *subblocks++              = impl->wip.subblocks.next;
        impl->wip.subblocks.next += impl->runtime->config->subblock.words; // FIXME -- move constants closer to structure
      } while (--rem > 0);

      // anything left to do?
      if (vertices == 0)
        break;

      // any tag after this will be a caboose command
      tag = SKC_BLOCK_ID_TAG_PATH_NEXT;
    }
}

//
//
//

static
void
skc_path_builder_pfn_end(struct skc_path_builder_impl * const impl, skc_path_t * const path)
{
  // finalize incomplete active subblocks -- we don't care about any
  // remaining unused subblocks in the block
  skc_path_builder_finalize_subblocks(impl->path_builder);

  // mark remaining wip.ids in the head or node as invalid
  skc_path_builder_impl_finalize_node(impl);

  // flush node if rem > 0 and node is not actually the head
  if (impl->wip.head->header.nodes >= 1)
    skc_path_builder_impl_flush_node(impl);

  // acquire path host id
  *path = skc_runtime_handle_device_acquire(impl->runtime); // FIXME -- MAY WANT TO GRAB AN ID ON BEGIN

  // save path host handle
  impl->wip.head->header.handle = *path;

  // flush head -- acquires a block and bumps head->header.blocks
  skc_path_builder_impl_flush_head(impl);

  // get current release
  struct skc_release_record * const release = skc_release_curr(impl);

  // acquire grid if null
  if (release->grid == NULL)
    {
      release->grid = SKC_GRID_DEPS_ATTACH(impl->runtime->deps,
                                           &release->grid, // NULL on start/force
                                           release,        // data payload
                                           skc_path_builder_grid_pfn_waiting,
                                           NULL,           // no execute pfn
                                           skc_path_builder_grid_pfn_dispose);
    }

  // update grid map
  skc_grid_map(release->grid,*path);

  // update path release
  impl->release.paths[release->to] = *path;

  // increment release.to
  release->to = (release->to + 1) % impl->ring.blocks_per.buffer;

  // add guard bit
  *path |= SKC_TYPED_HANDLE_TYPE_IS_PATH;
#if 1
  //
  // eager kernel launch?
  //
  {
    union skc_ringdex_expand const curr_from = skc_ringdex_expand(impl,impl->curr.from);
    union skc_ringdex_expand const curr_to   = skc_ringdex_expand(impl,impl->curr.to);

    if (curr_from.subbuf != curr_to.subbuf)
      {
        skc_grid_start(release->grid);
        // skc_scheduler_yield(impl->runtime->scheduler);
      }
  }
#endif
}

//
// FIXME -- clean up accessing of CONFIG constants in these 3 routines
//

static
void
skc_path_builder_pfn_new_line(struct skc_path_builder_impl * const impl)
{
  // acquire subblock pointers
  skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_LINE,4,
                                          impl->path_builder->line.coords);

  // increment line count
  impl->wip.head->header.prims += 1;

  // update rem_count_xxx count
  impl->path_builder->line.rem = impl->runtime->config->subblock.words;
}

static
void
skc_path_builder_pfn_new_quad(struct skc_path_builder_impl * const impl)
{
  // acquire subblock pointers
  skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_QUAD,6,
                                          impl->path_builder->quad.coords);

  // increment quad count
  impl->wip.head->header.prims += 1;

  // update rem_count_xxx count
  impl->path_builder->quad.rem = impl->runtime->config->subblock.words;
}

static
void
skc_path_builder_pfn_new_cubic(struct skc_path_builder_impl * const impl)
{
  // acquire subblock pointers
  skc_path_builder_impl_acquire_subblocks(impl,SKC_BLOCK_ID_TAG_PATH_CUBIC,8,
                                          impl->path_builder->cubic.coords);

  // increment cubic count
  impl->wip.head->header.prims += 1;

  // update rem_count_xxx count
  impl->path_builder->cubic.rem = impl->runtime->config->subblock.words;
}

//
//
//

static
void
skc_path_builder_pfn_release(struct skc_path_builder_impl * const impl)
{
  // decrement reference count
  if (--impl->path_builder->refcount != 0)
    return;

  //
  // otherwise, dispose of everything
  //
  struct skc_runtime * const runtime = impl->runtime;

  // free path builder
  skc_runtime_host_perm_free(impl->runtime,impl->path_builder);

  // release cq
  skc_runtime_release_cq_in_order(runtime,impl->cq);

  // release kernels
  cl(ReleaseKernel(impl->kernels.alloc));
  cl(ReleaseKernel(impl->kernels.copy));

  // free block and command extents
  cl(ReleaseMemObject(impl->blocks.buffer));
  skc_runtime_host_perm_free(runtime,impl->blocks.subbufs);

  cl(ReleaseMemObject(impl->cmds.buffer));
  skc_runtime_host_perm_free(runtime,impl->cmds.subbufs);

  // free records
  skc_runtime_host_perm_free(runtime,impl->release.records);
  skc_runtime_host_perm_free(runtime,impl->release.paths);

  // release staging head and node
  skc_runtime_host_perm_free(runtime,impl->wip.head);
  skc_runtime_host_perm_free(runtime,impl->wip.node);

  // release reads scratch array
  cl(ReleaseMemObject(impl->reads));

  // FIXME:
  //
  // for all subbuffers
  //   unmap   subbuffer
  //   release subbuffer
  //
  // printf("%s not releasing subbuffers\n",__func__);

  skc_runtime_host_perm_free(impl->runtime,impl);
}
//
//
//

skc_err
skc_path_builder_cl_12_create(struct skc_context        * const context,
                              struct skc_path_builder * * const path_builder)
{
  //
  // retain the context
  //
  skc_context_retain(context);

  struct skc_runtime * const runtime = context->runtime;

  // allocate path builder
  (*path_builder) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**path_builder));

  // init state
  SKC_ASSERT_STATE_INIT((*path_builder),SKC_PATH_BUILDER_STATE_READY);

  (*path_builder)->context = context;

  // save opaque impl-specific pointers
  (*path_builder)->begin     = skc_path_builder_pfn_begin;
  (*path_builder)->end       = skc_path_builder_pfn_end;
  (*path_builder)->new_line  = skc_path_builder_pfn_new_line;
  (*path_builder)->new_quad  = skc_path_builder_pfn_new_quad;
  (*path_builder)->new_cubic = skc_path_builder_pfn_new_cubic;
  (*path_builder)->release   = skc_path_builder_pfn_release;

  // initialize path builder counts
  (*path_builder)->line.rem  = 0;
  (*path_builder)->quad.rem  = 0;
  (*path_builder)->cubic.rem = 0;

  (*path_builder)->refcount  = 1;

  struct skc_path_builder_impl * const impl =
    skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl));

  (*path_builder)->impl = impl;

  //
  // init impl
  //
  impl->path_builder  = *path_builder;
  impl->runtime       = runtime;

  impl->cq            = skc_runtime_acquire_cq_in_order(runtime);

  impl->kernels.alloc = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_ALLOC);
  impl->kernels.copy  = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PATHS_COPY);

  //
  // FIXME -- let these config constants remain constant and in place
  //
  struct skc_config const * const config = runtime->config;

  impl->ring.subbufs           = config->paths_copy.buffer.count;
  impl->ring.blocks_per.buffer = config->paths_copy.subbuf.count * config->paths_copy.buffer.count;
  impl->ring.blocks_per.subbuf = config->paths_copy.subbuf.count;
  //
  // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  //

  cl_int cl_err;

  // allocate large device-side extent for path data
  impl->blocks.buffer = clCreateBuffer(runtime->cl.context,
                                       CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                                       config->paths_copy.block.buffer, // FIXME -- either use config or local constants everywhere
                                       NULL,&cl_err); cl_ok(cl_err);

  // allocate small host-side array of pointers to mapped subbufs
  impl->blocks.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
                                                     impl->ring.subbufs *
                                                     sizeof(*impl->blocks.subbufs));

  // allocate large device-side extent for path copy commands
  impl->cmds.buffer = clCreateBuffer(runtime->cl.context,
                                     CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                                     config->paths_copy.command.buffer,
                                     NULL,&cl_err); cl_ok(cl_err);

  // allocate small host-side array of pointers to mapped subbufs
  impl->cmds.subbufs = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
                                                   impl->ring.subbufs *
                                                   sizeof(*impl->cmds.subbufs));

  // allocate small host-side array of intervals of path handles
  impl->release.records = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
                                                      impl->ring.subbufs *
                                                      sizeof(*impl->release.records));

  // allocate large host-side array that is max # of path handles in flight
  impl->release.paths = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
                                                    impl->ring.blocks_per.buffer *
                                                    sizeof(*impl->release.paths));

  // small scratch used by kernels
  impl->reads = clCreateBuffer(runtime->cl.context,
                               CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS,
                               sizeof(skc_uint) * impl->ring.subbufs,
                               NULL,&cl_err); cl_ok(cl_err);

  // initialize release records with impl backpointer
  for (skc_uint ii=0; ii<impl->ring.subbufs; ii++)
    {
      struct skc_release_record * record = impl->release.records + ii;

      record->impl = impl;
      record->grid = NULL;
      record->from = record->to = ii * impl->ring.blocks_per.subbuf;
    }

  //
  // allocate and map subbuffers -- we always check the command
  // subbuffer's map/unmap events before touching it or its associated
  // block subbuffer.
  //
  struct skc_subbuffer_blocks * sb = impl->blocks.subbufs;
  struct skc_subbuffer_cmds   * sc = impl->cmds  .subbufs;

  cl_buffer_region rb = { 0, config->paths_copy.block.subbuf   };
  cl_buffer_region rc = { 0, config->paths_copy.command.subbuf };

  // for each subbuffer
  for (skc_uint ii=0; ii<config->paths_copy.buffer.count; ii++)
    {
      sb->device = clCreateSubBuffer(impl->blocks.buffer,
                                     CL_MEM_HOST_WRITE_ONLY,
                                     CL_BUFFER_CREATE_TYPE_REGION,
                                     &rb,
                                     &cl_err); cl_ok(cl_err);

      sb->host   = clEnqueueMapBuffer(impl->cq,
                                      sb->device,
                                      CL_FALSE,
                                      CL_MAP_WRITE_INVALIDATE_REGION,
                                      0,rb.size,
                                      0,NULL,NULL,
                                      &cl_err); cl_ok(cl_err);

      sc->device = clCreateSubBuffer(impl->cmds.buffer,
                                     CL_MEM_HOST_WRITE_ONLY,
                                     CL_BUFFER_CREATE_TYPE_REGION,
                                     &rc,
                                     &cl_err); cl_ok(cl_err);

      sc->host   = clEnqueueMapBuffer(impl->cq,
                                      sc->device,
                                      CL_FALSE,
                                      CL_MAP_WRITE_INVALIDATE_REGION,
                                      0,rc.size,
                                      0,NULL,&sc->map,
                                      &cl_err); cl_ok(cl_err);

      sb        += 1;
      sc        += 1;

      rb.origin += rb.size;
      rc.origin += rc.size;
    }

  //
  // initialize remaining members
  //
  impl->prev.from        = 0;
  impl->prev.to          = 0;
  impl->prev.rolling     = 0;

  impl->curr.from        = 0;
  impl->curr.to          = 0;

  impl->wip.to           = 0;

  impl->wip.head         = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes);
  impl->wip.node         = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,config->block.bytes);

  impl->wip.rolling.one  = SKC_BLOCK_ID_TAG_COUNT * config->block.subblocks;
  impl->wip.rolling.next = 0;

  // for now, completely initialize the builder before returning
  cl(Finish(impl->cq));

  return SKC_ERR_SUCCESS;
}

//
//
//
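//
// A minimal usage sketch (not compiled): creating a path builder via
// this entry point and disposing of it through its release pfn.  The
// 'context' argument is assumed to be a fully initialized skc_context
// and the teardown call shown is an assumption based on the pfn
// wiring in create() above.
//
#if 0
static
void
skc_path_builder_cl_12_example(struct skc_context * const context)
{
  struct skc_path_builder * pb;

  if (skc_path_builder_cl_12_create(context,&pb) == SKC_ERR_SUCCESS)
    {
      // ... build paths with pb->begin / pb->new_line / pb->end ...

      // drop the sole reference -- frees the impl and its extents
      pb->release(pb->impl);
    }
}
#endif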