Diffstat (limited to 'src/compute/skc/platforms/cl_12/raster_builder_cl_12.c')
-rw-r--r-- | src/compute/skc/platforms/cl_12/raster_builder_cl_12.c | 1349 |
1 file changed, 1349 insertions, 0 deletions
diff --git a/src/compute/skc/platforms/cl_12/raster_builder_cl_12.c b/src/compute/skc/platforms/cl_12/raster_builder_cl_12.c
new file mode 100644
index 0000000000..33992cbdfb
--- /dev/null
+++ b/src/compute/skc/platforms/cl_12/raster_builder_cl_12.c
@@ -0,0 +1,1349 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+// get rid of these
+#include <stdio.h>
+#include <stdlib.h>
+
+//
+//
+//
+
+#include "hs/cl/hs_cl_launcher.h"
+
+#include "common/cl/assert_cl.h"
+
+#include "context.h"
+#include "grid.h"
+#include "raster.h"
+#include "extent_ring.h"
+#include "raster_builder.h"
+
+#include "tile.h"
+
+#include "config_cl.h"
+#include "runtime_cl_12.h"
+#include "extent_cl_12.h"
+#include "raster_builder_cl_12.h"
+
+//
+// RASTERIZATION SUB-PIPELINE
+// --------------------------
+//
+// Phase 1: expand commands
+//
+// Phase 2: rasterize
+//
+// Phase 3: sort & segment || release paths
+//
+// Phase 4: prefix
+//
+// Phase 5: release rasters
+//
+// RASTER COHORT
+// ==============
+//
+//                     BUILDER                        RASTERIZER                       POST PROCESSING
+// <----------------------------------------------->  <-------------->  <--------------------------------------------------------------->
+//
+// fill cmds  transforms  raster clips  path release   rasterize cmds   cohort map  raster release  TTSB  TTSK  cohort atomics  context atomics
+// ---------  ----------  ------------  ------------   --------------   ----------  --------------  ----  ----  --------------  ---------------
+//    1,2        1,2          1,2           1,2               2             1-4        1,2,3,4       2-4   2-4       2-4             global
+//
+//
+// NOTES: FINE-GRAINED SVM
+// -----------------------
+//
+// 1) In a fine-grained system we know the exact number of
+//    rasterize cmds per segment type before phase 1
+//
+// 2) A raster that's "under construction" shouldn't be rasterized
+//    until it is complete.  This implies that a raster is not part
+//    of a cohort until it is complete.  The raster builder must
+//    handle raster promises being "forced" to completion -- this is
+//    likely the result of composition construction and subsequent
+//    rendering to a surface.
+//
+// 3) The raster cohort rasterizer state retains the fill cmd,
+//    transform, raster clip and path release "ring" extents.
+//
+// 4) The rasterize cmd extent sizes (line, quad, cubic, rational
+//    quad, rational cubic) are known ahead of time.
+//
+// 5) The raster cohort post processor is standalone and retains the
+//    raster_map, cohort atomics, TTSK_RYX extent, and raster
+//    references until complete.
+//
+
+//
+// Notes:
+//
+// - Could have a pipeline stage before expansion that counts the
+//   exact number of line/quad/cubic commands, but the command
+//   buffers are relatively small (64-bit commands * # of path
+//   segments).
+//
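As a rough reading of the sizing note above, a back-of-envelope sketch (the segment count below is illustrative; the real capacity comes from the runtime config, not from this arithmetic):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
  // each path segment expands to one 64-bit rasterization command
  uint64_t const segments = 1000000;                      // hypothetical workload
  uint64_t const bytes    = segments * sizeof(uint64_t);  // 64-bit commands

  printf("%llu MiB\n",(unsigned long long)(bytes >> 20)); // ~7 MiB -- small

  return 0;
}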
+
+//          raster
+//          cohort atomics  path_ids  raster_ids  transforms  clips  cmds_fill  cmds_l/q/c  ttsk_ryx
+//
+// BEGIN                    ^
+//                          |
+// EXPAND                   |
+//                          |
+// RASTERIZE                |
+//                          |
+// SORT || RELEASE PATHS    |
+//                          |
+// PREFIX                   |
+//                          |
+// RELEASE RASTERS          |
+//                          |
+// END                      v
+//
+//
+// BEGIN
+//
+// EXPAND                -- PRODUCES:   one or more extents of rasterization commands
+//
+// RASTERIZE             -- DEPENDENCY: requires size of command extents before launching
+//                        -- PRODUCES:   an extent of ttsk_ryx keys
+//
+// SORT || RELEASE PATHS -- DEPENDENCY: requires size of key extent before launching
+//                        -- PRODUCES:   sorted array of keys
+//
+// PREFIX                -- DEPENDENCY: none -- can execute after SORT because grid size is number of rasters
+//
+// RELEASE RASTERS       -- DEPENDENCY: none -- can execute after PREFIX
+//
+// END
+//
+
+// ------------------------
+//
+// DEPENDENCY is cleanly implemented with a host callback or device kernel launcher
+//
+// Can this hide resource acquisition?  Yes.  But there are two cases:
+//
+//   1. acquisition of resources occurs on the host thread and lack of
+//      resources drains the host command queue until resources are
+//      available (OpenCL 2.x)
+//
+//   2. the host commands lazily acquire resources (OpenCL 1.2)
+//
+// ------------------------
+//
+// How to express?
+//
+// Each substage launches its successors.  This supports both dependency models.
+//
+// If OpenCL 1.2 then the substage can't be launched until the prior
+// stage's event is complete.  So this requires registering a callback
+// to invoke the substage.
+//
+// (a minimal host-side sketch of this pattern follows the RASTER
+// COHORT overview below)
+//
+// ------------------------
+
+//
+// BUILD
+//
+
+struct skc_raster_builder_impl
+{
+  struct skc_raster_builder   * raster_builder;
+  struct skc_runtime          * runtime;
+
+  skc_grid_t                    cohort;
+
+  // these are all durable/perm extents
+  struct skc_extent_phrwg_thr1s path_ids;   // read/write by host
+  struct skc_extent_phw1g_tdrNs transforms; // write once by host + read by device
+  struct skc_extent_phw1g_tdrNs clips;      // write once by host + read by device
+  struct skc_extent_phw1g_tdrNs fill_cmds;  // write once by host + read by device
+  struct skc_extent_phrwg_tdrNs raster_ids; // read/write by host + read by device
+
+  struct {
+    cl_kernel                   fills_expand;
+    cl_kernel                   rasterize_all;
+    cl_kernel                   segment;
+    cl_kernel                   rasters_alloc;
+    cl_kernel                   prefix;
+  } kernels;
+};
+
+//
+// RASTER COHORT
+//
+// This sub-pipeline snapshots the raster builder and then acquires
+// and releases host and device resources as necessary (as late as
+// possible).
+//
+// Note that the cohort extents are ephemeral and are only used by one
+// or more stages of the rasterization sub-pipeline.
+//
+// The pipeline implementation may vary between compute platforms.
+//
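The OpenCL 1.2 dependency model described above can be made concrete with a small host-side sketch: a stage's completion event fires a callback which does nothing but hand the successor substage to a host scheduler. This is the pattern the *_cb functions later in this file follow; scheduler_schedule and next_substage here are illustrative stand-ins, not SKC APIs:

#include <CL/cl.h>

// hypothetical stand-in for SKC_SCHEDULER_SCHEDULE()
extern void scheduler_schedule(void (*pfn)(void *), void * data);

static void next_substage(void * data); // the successor substage

// do no real work on the driver's callback thread -- just defer
static void CL_CALLBACK substage_complete_cb(cl_event event, cl_int status, void * data)
{
  if (status == CL_COMPLETE)
    scheduler_schedule(next_substage,data);
}

static void launch_substage(cl_command_queue cq, cl_kernel kernel,
                            size_t global_size, void * data)
{
  cl_event complete;

  clEnqueueNDRangeKernel(cq,kernel,1,NULL,&global_size,NULL,0,NULL,&complete);

  // the successor can't launch until this event completes
  clSetEventCallback(complete,CL_COMPLETE,substage_complete_cb,data);
  clReleaseEvent(complete);
  clFlush(cq); // ensure the work is actually submitted
}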
+
+struct skc_raster_cohort
+{
+  struct skc_raster_builder_impl   * impl;
+
+  struct skc_extent_phrwg_thr1s_snap path_ids;   // read/write by host
+  struct skc_extent_phw1g_tdrNs_snap transforms; // write once by host + read by device
+  struct skc_extent_phw1g_tdrNs_snap clips;      // write once by host + read by device
+  struct skc_extent_phw1g_tdrNs_snap fill_cmds;  // write once by host + read by device
+  struct skc_extent_phrwg_tdrNs_snap raster_ids; // read/write by host + read by device
+
+  cl_command_queue                   cq;
+
+  // sub-pipeline atomics
+  struct skc_extent_thr_tdrw         atomics;
+
+  // path primitives are expanded into line/quad/cubic/rational cmds
+  struct skc_extent_tdrw             cmds;
+
+  // rasterization output
+  struct skc_extent_tdrw             keys;
+  // struct skc_extent_thrw_tdrw     keys;
+
+  // post-sort extent with metadata for each raster
+  struct skc_extent_tdrw             metas;
+  // struct skc_extent_thrw_tdrw     metas;
+
+  // subbuf id
+  skc_subbuf_id_t                    id;
+
+  //
+  // pipeline also uses the following global resources:
+  //
+  //   - command queue from global factory
+  //   - global block pool and its atomics
+  //   - global path and raster host id map
+  //   - temporary host and device allocations
+  //
+};
+
+//
+// TTRK (64-BIT COMPARE)
+//
+//  0                                       63
+//  | TTSB ID |   X  |   Y  | COHORT ID |
+//  +---------+------+------+-----------+
+//  |    27   |  12  |  12  |     13    |
+//
+//
+// TTRK (32-BIT COMPARE)
+//
+//  0                                             63
+//  | TTSB ID | N/A |   X  |   Y  | COHORT ID |
+//  +---------+-----+------+------+-----------+
+//  |    27   |  5  |  12  |  12  |     8     |
+//
+
+//
+// TTRK is the sortable intermediate key format for TTSK
+//
+// We're going to use the 32-bit comparison version for now
+//
+
+union skc_ttrk
+{
+  skc_ulong  u64;
+  skc_uint2  u32v2;
+
+  struct {
+    skc_uint block    : SKC_TTXK_LO_BITS_ID;
+    skc_uint na0      : SKC_TTRK_LO_BITS_NA;
+    skc_uint x        : SKC_TTXK_HI_BITS_X;
+    skc_uint y        : SKC_TTXK_HI_BITS_Y;
+    skc_uint cohort   : SKC_TTRK_HI_BITS_COHORT;
+  };
+
+  struct {
+    skc_uint na1;
+    skc_uint yx       : SKC_TTXK_HI_BITS_YX;
+    skc_uint na2      : SKC_TTRK_HI_BITS_COHORT;
+  };
+
+  struct {
+    skc_uint na3;
+    skc_uint na4      : SKC_TTXK_HI_BITS_X;
+    skc_uint cohort_y : SKC_TTRK_HI_BITS_COHORT_Y;
+  };
+};
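Why a plain unsigned sort is enough: in the 32-bit-compare layout above, COHORT occupies the most significant bits of the key, then Y, then X, so ascending u64 order clusters keys by cohort, then by tile row, then by tile column. A hedged sketch with the widths written out literally from the diagram (the real code uses the SKC_TTXK_*/SKC_TTRK_* constants and bitfields, not these shifts):

#include <stdint.h>
#include <stdio.h>

// 32-bit-compare TTRK -- lo: block:27 na:5, hi: x:12 y:12 cohort:8
static uint64_t ttrk_pack(uint32_t block, uint32_t x, uint32_t y, uint32_t cohort)
{
  uint32_t const lo = block & ((1u << 27) - 1);
  uint32_t const hi = (x & 0xFFF) | ((y & 0xFFF) << 12) | ((cohort & 0xFF) << 24);

  return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
  uint64_t const a = ttrk_pack(7,5,2,0); // cohort 0, tile (y=2,x=5)
  uint64_t const b = ttrk_pack(3,6,2,0); // cohort 0, tile (y=2,x=6)
  uint64_t const c = ttrk_pack(0,0,0,1); // cohort 1 dominates both

  printf("%d %d\n",a < b,b < c); // prints "1 1"

  return 0;
}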
+
+//
+//
+//
+
+static
+void
+skc_raster_builder_pfn_release(struct skc_raster_builder_impl * const impl)
+{
+  // decrement reference count
+  if (--impl->raster_builder->refcount != 0)
+    return;
+
+  //
+  // otherwise, dispose of the raster builder and its impl
+  //
+  struct skc_runtime * const runtime = impl->runtime;
+
+  // free the raster builder
+  skc_runtime_host_perm_free(runtime,impl->raster_builder);
+
+  // free durable/perm extents
+  skc_extent_phrwg_thr1s_free(runtime,&impl->path_ids);
+  skc_extent_phw1g_tdrNs_free(runtime,&impl->transforms);
+  skc_extent_phw1g_tdrNs_free(runtime,&impl->clips);
+  skc_extent_phw1g_tdrNs_free(runtime,&impl->fill_cmds);
+  skc_extent_phrwg_tdrNs_free(runtime,&impl->raster_ids);
+
+  // release kernels
+  cl(ReleaseKernel(impl->kernels.fills_expand));
+  cl(ReleaseKernel(impl->kernels.rasterize_all));
+
+#if 0
+  cl(ReleaseKernel(impl->kernels.rasterize_lines));
+  cl(ReleaseKernel(impl->kernels.rasterize_quads));
+  cl(ReleaseKernel(impl->kernels.rasterize_cubics));
+#endif
+
+  cl(ReleaseKernel(impl->kernels.segment));
+  cl(ReleaseKernel(impl->kernels.rasters_alloc));
+  cl(ReleaseKernel(impl->kernels.prefix));
+
+  // free the impl
+  skc_runtime_host_perm_free(runtime,impl);
+}
+
+//
+//
+//
+
+static
+void
+skc_raster_builder_rasters_release(struct skc_runtime * const runtime,
+                                   skc_raster_t const * const rasters,
+                                   skc_uint       const       size,
+                                   skc_uint       const       from,
+                                   skc_uint       const       to)
+{
+  if (from <= to) // no wrap
+    {
+      skc_raster_t const * rasters_from = rasters + from;
+      skc_uint             count_from   = to      - from;
+
+      skc_grid_deps_unmap(runtime->deps,rasters_from,count_from);
+      skc_runtime_raster_device_release(runtime,rasters_from,count_from);
+    }
+  else // from > to implies wrap
+    {
+      skc_raster_t const * rasters_lo = rasters + from;
+      skc_uint             count_lo   = size    - from;
+
+      skc_grid_deps_unmap(runtime->deps,rasters_lo,count_lo);
+      skc_runtime_raster_device_release(runtime,rasters_lo,count_lo);
+
+      skc_grid_deps_unmap(runtime->deps,rasters,to);
+      skc_runtime_raster_device_release(runtime,rasters,to);
+    }
+}
+
+static
+void
+skc_raster_builder_paths_release(struct skc_runtime                 * const runtime,
+                                 struct skc_extent_phrwg_thr1s_snap * const snap)
+{
+  // release lo
+  skc_runtime_path_device_release(runtime,snap->hr1.lo,snap->count.lo);
+
+  // release hi
+  if (snap->count.hi)
+    skc_runtime_path_device_release(runtime,snap->hr1.hi,snap->count.hi);
+}
+
+static
+void
+skc_raster_builder_cohort_grid_pfn_dispose(skc_grid_t const grid)
+{
+  //
+  // ALLOCATED RESOURCES
+  //
+  // path_ids       -
+  // raster_ids     a
+  // transforms     -
+  // clips          -
+  // fill_cmds      -
+  // cq             a
+  // cohort atomics a
+  // cmds           -
+  // keys           a
+  // meta           a
+  //
+
+  struct skc_raster_cohort       * const cohort  = skc_grid_get_data(grid);
+  struct skc_raster_builder_impl * const impl    = cohort->impl;
+  struct skc_runtime             * const runtime = impl->runtime;
+
+  //
+  // release paths -- FIXME -- Note that releasing paths can be
+  // performed after rasterization is complete
+  //
+
+  // snap alloc the paths -- this host snap simply sets up pointers
+  skc_extent_phrwg_thr1s_snap_alloc(runtime,&impl->path_ids,&cohort->path_ids);
+
+  // unmap and release the paths
+  skc_raster_builder_paths_release(runtime,&cohort->path_ids);
+
+  // release path ids
+  skc_extent_phrwg_thr1s_snap_free(runtime,&cohort->path_ids);
+
+  //
+  // release rasters
+  //
+  skc_uint const size = cohort->raster_ids.snap->ring->size.pow2;
+  skc_uint const from = skc_extent_ring_snap_from(cohort->raster_ids.snap);
+  skc_uint const to   = skc_extent_ring_snap_to(cohort->raster_ids.snap);
+
+  // unmap and release raster ids
+  skc_raster_builder_rasters_release(runtime,impl->raster_ids.hrw,size,from,to);
+
+  // release cohort's remaining allocated resources
+  skc_extent_phrwg_tdrNs_snap_free(runtime,&cohort->raster_ids);
+  skc_runtime_release_cq_in_order(runtime,cohort->cq);
+  skc_extent_thr_tdrw_free(runtime,&cohort->atomics);
+  skc_extent_tdrw_free(runtime,&cohort->keys);
+  skc_extent_tdrw_free(runtime,&cohort->metas);
+  // skc_extent_thrw_tdrw_free(runtime,&cohort->keys);
+  // skc_extent_thrw_tdrw_free(runtime,&cohort->metas);
+  skc_runtime_host_temp_free(runtime,cohort,cohort->id);
+
+  // release the raster builder
+  skc_raster_builder_pfn_release(impl);
+
+  //
+  // ALLOCATED RESOURCES
+  //
+  // path_ids       -
+  // raster_ids     -
+  // transforms     -
+  // clips          -
+  // fill_cmds      -
+  // cq             -
+  // cohort atomics -
+  // cmds           -
+  // keys           -
+  // meta           -
+  //
+}
+
+//
+//
+//
+
+static
+void
+skc_raster_cohort_prefix_release(skc_grid_t const grid)
+{
+  // FIXME -- note that pfn_dispose can be accomplished here
+
+  // release the grid
+  skc_grid_complete(grid);
+}
+
+static
+void
+skc_raster_cohort_prefix_cb(cl_event event, cl_int status, skc_grid_t const grid)
+{
+  SKC_CL_CB(status);
+
+  struct skc_raster_cohort * const cohort    = skc_grid_get_data(grid);
+  struct skc_scheduler     * const scheduler = cohort->impl->runtime->scheduler;
+
+  // as quickly as possible, enqueue next stage in pipeline to context command scheduler
+  SKC_SCHEDULER_SCHEDULE(scheduler,skc_raster_cohort_prefix_release,grid);
+}
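The two-branch wrap logic in skc_raster_builder_rasters_release above is the standard pattern for visiting a snapshot's span of a power-of-two ring; a generalized sketch of the same idea (ring_span_apply and span_fn are illustrative helpers, not SKC APIs):

#include <stdint.h>

typedef void (*span_fn)(void * arg, uint32_t first, uint32_t count);

// apply fn to ring entries [from,to), splitting once if the span
// wraps past the end of a ring holding `size` entries
static void ring_span_apply(uint32_t size, uint32_t from, uint32_t to,
                            span_fn fn, void * arg)
{
  if (from <= to) // contiguous -- no wrap
    {
      fn(arg,from,to - from);
    }
  else // wrapped -- [from,size) then [0,to)
    {
      fn(arg,from,size - from);
      fn(arg,0,to);
    }
}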
+
+//
+//
+//
+
+#if 0
+static
+int cmp64(const void * ptr_a, const void * ptr_b)
+{
+  skc_ulong const a = *(const skc_ulong *)ptr_a;
+  skc_ulong const b = *(const skc_ulong *)ptr_b;
+
+  if      (a < b) return -1;
+  else if (a > b) return +1;
+  else            return  0;
+}
+#endif
+
+//
+//
+//
+
+static
+void
+skc_raster_cohort_sort_prefix(skc_grid_t const grid)
+{
+  //
+  // ALLOCATED RESOURCES
+  //
+  // path_ids       i
+  // raster_ids     i
+  // transforms     a
+  // clips          a
+  // fill_cmds      -
+  // cq             a
+  // cohort atomics a
+  // cmds           a
+  // keys           a
+  // meta           -
+  //
+
+  // use the backpointers
+  struct skc_raster_cohort       * const cohort  = skc_grid_get_data(grid);
+  struct skc_raster_builder_impl * const impl    = cohort->impl;
+  struct skc_runtime             * const runtime = impl->runtime;
+
+  // release transforms
+  skc_extent_phw1g_tdrNs_snap_free(runtime,&cohort->transforms);
+
+  // release clips
+  skc_extent_phw1g_tdrNs_snap_free(runtime,&cohort->clips);
+
+  // release expanded cmds
+  skc_extent_tdrw_free(runtime,&cohort->cmds);
+
+  // alloc the snapshot -- could be zero-sized
+  skc_extent_phrwg_tdrNs_snap_alloc(runtime,
+                                    &impl->raster_ids,
+                                    &cohort->raster_ids,
+                                    cohort->cq,NULL);
+
+  // will never be zero
+  skc_uint const rasters = skc_extent_ring_snap_count(cohort->raster_ids.snap);
+
+  // acquire fixed-size device-side extent
+  skc_extent_tdrw_alloc(runtime,
+                        &cohort->metas,
+                        sizeof(struct skc_raster_cohort_meta));
+
+  // skc_extent_thrw_tdrw_alloc(runtime,
+  //                            &cohort->metas,
+  //                            sizeof(struct skc_raster_cohort_meta));
+
+  // zero the metas
+  skc_extent_tdrw_zero(&cohort->metas,cohort->cq,NULL);
+
+  // get the read-only host copy of the device atomics
+  struct skc_raster_cohort_atomic const * const atomics = cohort->atomics.hr;
+
+  //
+  // SORT
+  //
+  if (atomics->keys > 0)
+    {
+#ifndef NDEBUG
+      fprintf(stderr,"raster cohort sort: %u\n",atomics->keys);
+#endif
+
+      //
+      //
+      //
+      uint32_t keys_padded_in, keys_padded_out;
+
+      hs_pad(atomics->keys,&keys_padded_in,&keys_padded_out);
+
+      hs_sort(cohort->cq,
+              cohort->keys.drw,
+              cohort->keys.drw,
+              atomics->keys,
+              keys_padded_in,
+              keys_padded_out,
+              false);
+
+      cl(SetKernelArg(impl->kernels.segment,0,SKC_CL_ARG(cohort->keys.drw)));
+      cl(SetKernelArg(impl->kernels.segment,1,SKC_CL_ARG(cohort->metas.drw)));
+
+#ifndef NDEBUG
+      fprintf(stderr,"post-sort\n");
+#endif
+
+      // find start of each tile
+      skc_device_enqueue_kernel(runtime->device,
+                                SKC_DEVICE_KERNEL_ID_SEGMENT_TTRK,
+                                cohort->cq,
+                                impl->kernels.segment,
+                                atomics->keys,
+                                0,NULL,NULL);
+
+#ifndef NDEBUG
+      fprintf(stderr,"post-segment\n");
+#endif
+
+      //
+      // DELETE ALL THIS WHEN READY
+      //
+
+#if 0
+      //
+      //
+      //
+      cl(Finish(cohort->cq));
+
+      // map keys to host
+      union skc_ttrk * const keys = skc_extent_thrw_tdrw_map(&cohort->keys,
+                                                             cohort->cq,
+                                                             NULL);
+      // map meta to host
+      struct skc_raster_cohort_meta * const metas = skc_extent_thrw_tdrw_map(&cohort->metas,
+                                                                             cohort->cq,
+                                                                             NULL);
+      // block until done
+      cl(Finish(cohort->cq));
+
+      // sort keys
+      qsort(keys,atomics->keys,sizeof(*keys),cmp64);
+
+      // mask to determine if rk id is a new block
+      skc_uint const subblock_mask = runtime->config->block.subblocks - 1;
+
+      //
+      // some counters
+      //
+      union skc_raster_cohort_meta_in meta_in = {
+        .blocks = 0,
+        .offset = 0,
+        .pk     = 0,
+        .rk     = 0
+      };
+
+      // get first key
+      union skc_ttrk curr = keys[0];
+
+      skc_uint ii=0, jj=0;
+
+      // for all TTRK keys
+      while (true)
+        {
+          // increment ttrk count
+          meta_in.rk += 1;
+
+          // was this a new block?
+          if ((curr.u32v2.lo & subblock_mask) == 0)
+            meta_in.blocks += 1;
+
+          // break if we're out of keys
+          if (++ii >= atomics->keys)
+            break;
+
+          // otherwise, process next key
+          union skc_ttrk const next = keys[ii];
+
+          // if new cohort then save curr meta and init next meta
+          if (next.cohort != curr.cohort)
+            {
+              fprintf(stderr,"[ %u, %u, %u, %u ]\n",
+                      meta_in.blocks,
+                      meta_in.offset,
+                      meta_in.pk,
+                      meta_in.rk);
+
+              // store back to buffer
+              metas->inout[curr.cohort].in = meta_in;
+
+              // update meta_in
+              meta_in.blocks = 0;
+              meta_in.offset = ii;
+              meta_in.pk     = 0;
+              meta_in.rk     = 0;
+            }
+          // otherwise, if same y but new x then increment TTPK count
+          else if ((next.y == curr.y) && (next.x != curr.x))
+            {
+              meta_in.pk += 1;
+
+#if 0
+              fprintf(stderr,"%3u : %3u : ( %3u, %3u ) -> ( %3u )\n",
+                      jj++,curr.cohort,curr.y,curr.x,next.x);
+#endif
+            }
+
+#if 0
+          fprintf(stderr,"( %3u, %3u )\n",next.y,next.x);
+#endif
+
+          curr = next;
+        }
+
+      fprintf(stderr,"[ %u, %u, %u, %u ]\n",
+              meta_in.blocks,
+              meta_in.offset,
+              meta_in.pk,
+              meta_in.rk);
+
+      // store back to buffer
+      metas->inout[curr.cohort].in = meta_in;
+
+      // unmap
+      skc_extent_thrw_tdrw_unmap(&cohort->keys,
+                                 keys,
+                                 cohort->cq,
+                                 NULL);
+
+      // unmap
+      skc_extent_thrw_tdrw_unmap(&cohort->metas,
+                                 metas,
+                                 cohort->cq,
+                                 NULL);
+#endif
+    }
+
+#ifndef NDEBUG
+  fprintf(stderr,"rasters_alloc: %u\n",rasters);
+#endif
+
+  //
+  // RASTER ALLOC/INIT
+  //
+  cl(SetKernelArg(impl->kernels.rasters_alloc,0,SKC_CL_ARG(runtime->block_pool.atomics.drw)));
+  cl(SetKernelArg(impl->kernels.rasters_alloc,1,SKC_CL_ARG(runtime->block_pool.ids.drw)));
+  cl(SetKernelArg(impl->kernels.rasters_alloc,2,SKC_CL_ARG(runtime->block_pool.size->ring_mask)));
+  cl(SetKernelArg(impl->kernels.rasters_alloc,3,SKC_CL_ARG(runtime->handle_pool.map.drw)));
+  cl(SetKernelArg(impl->kernels.rasters_alloc,4,SKC_CL_ARG(cohort->metas.drw)));
+  cl(SetKernelArg(impl->kernels.rasters_alloc,5,SKC_CL_ARG(cohort->raster_ids.drN)));
+  cl(SetKernelArg(impl->kernels.rasters_alloc,6,SKC_CL_ARG(rasters)));
+
+  skc_device_enqueue_kernel(runtime->device,
+                            SKC_DEVICE_KERNEL_ID_RASTERS_ALLOC,
+                            cohort->cq,
+                            impl->kernels.rasters_alloc,
+                            rasters,
+                            0,NULL,NULL);
+
+#ifndef NDEBUG
+  fprintf(stderr,"post-alloc\n");
+#endif
+
+  //
+  // PREFIX
+  //
+  cl(SetKernelArg(impl->kernels.prefix,0,SKC_CL_ARG(runtime->block_pool.atomics.drw)));
+  cl(SetKernelArg(impl->kernels.prefix,1,SKC_CL_ARG(runtime->block_pool.ids.drw)));
+  cl(SetKernelArg(impl->kernels.prefix,2,SKC_CL_ARG(runtime->block_pool.blocks.drw)));
+  cl(SetKernelArg(impl->kernels.prefix,3,SKC_CL_ARG(runtime->block_pool.size->ring_mask)));
+
+  cl(SetKernelArg(impl->kernels.prefix,4,SKC_CL_ARG(cohort->keys.drw)));
+  cl(SetKernelArg(impl->kernels.prefix,5,SKC_CL_ARG(runtime->handle_pool.map.drw)));
+
+  cl(SetKernelArg(impl->kernels.prefix,6,SKC_CL_ARG(cohort->metas.drw)));
+  cl(SetKernelArg(impl->kernels.prefix,7,SKC_CL_ARG(rasters)));
+
+  cl_event complete;
+
+  skc_device_enqueue_kernel(runtime->device,
+                            SKC_DEVICE_KERNEL_ID_PREFIX,
+                            cohort->cq,
+                            impl->kernels.prefix,
+                            rasters,
+                            0,NULL,
+                            &complete);
+
+  cl(SetEventCallback(complete,CL_COMPLETE,skc_raster_cohort_prefix_cb,grid));
+  cl(ReleaseEvent(complete));
+
+#ifndef NDEBUG
+  fprintf(stderr,"post-prefix\n");
+#endif
+
+  // flush command queue
+  cl(Flush(cohort->cq));
+
+  //
+  // ALLOCATED RESOURCES
+  //
+  // path_ids       a
+  // raster_ids     a
+  // transforms     -
+  // clips          -
+  // fill_cmds      -
+  // cq             a
+  // cohort atomics a
+  // cmds           -
+  // keys           a
+  // meta           a
+  //
+}
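What SEGMENT_TTRK is looking for ("find start of each tile") can be pictured with a serial host-side scan: after sorting, a tile boundary is any index where the high word of the key -- the (cohort,y,x) bits -- changes. A hedged sketch reusing the illustrative key layout from earlier; the real kernel does this in parallel and also accumulates the per-cohort meta counters:

#include <stdint.h>

// count indices that start a new (cohort,y,x) tile in sorted keys
static uint32_t count_tile_starts(uint64_t const * keys, uint32_t n)
{
  uint32_t starts = 0;

  for (uint32_t i = 0; i < n; i++)
    if (i == 0 || (uint32_t)(keys[i] >> 32) != (uint32_t)(keys[i-1] >> 32))
      starts += 1; // high word changed -- a new tile begins here

  return starts;
}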
+
+static
+void
+skc_raster_cohort_rasterize_cb(cl_event event, cl_int status, skc_grid_t const grid)
+{
+  SKC_CL_CB(status);
+
+  struct skc_raster_cohort * const cohort = skc_grid_get_data(grid);
+
+  // as quickly as possible, enqueue next stage in pipeline to context command scheduler
+  SKC_SCHEDULER_SCHEDULE(cohort->impl->runtime->scheduler,skc_raster_cohort_sort_prefix,grid);
+}
+
+static
+void
+skc_raster_cohort_rasterize(skc_grid_t const grid)
+{
+  //
+  // ALLOCATED RESOURCES
+  //
+  // path_ids       i
+  // raster_ids     i
+  // transforms     i
+  // clips          i
+  // fill_cmds      s
+  // cq             a
+  // cohort atomics a
+  // cmds           a
+  // cmds_quad      a
+  // cmds_cubic     a
+  // keys           -
+  // meta           -
+  //
+
+  // use the backpointers
+  struct skc_raster_cohort       * const cohort  = skc_grid_get_data(grid);
+  struct skc_raster_builder_impl * const impl    = cohort->impl;
+  struct skc_runtime             * const runtime = impl->runtime;
+
+  //
+  // RELEASED RESOURCES
+  //
+  // fill_cmds snap
+  //
+
+  // release the fill cmds snap since it's only used by the expand stage
+  skc_extent_phw1g_tdrNs_snap_free(runtime,&cohort->fill_cmds);
+
+  //
+  // NEW ALLOCATED RESOURCES
+  //
+  // transforms snap
+  // clips snap
+  // ttrk keys
+  //
+  skc_extent_phw1g_tdrNs_snap_alloc(runtime,
+                                    &impl->transforms,
+                                    &cohort->transforms,
+                                    cohort->cq,NULL);
+
+  skc_extent_phw1g_tdrNs_snap_alloc(runtime,
+                                    &impl->clips,
+                                    &cohort->clips,
+                                    cohort->cq,NULL);
+
+  // acquire device-side extent
+  skc_extent_tdrw_alloc(runtime,
+                        &cohort->keys,
+                        sizeof(union skc_ttrk) * runtime->config->raster_cohort.rasterize.keys);
+
+  // skc_extent_thrw_tdrw_alloc(runtime,
+  //                            &cohort->keys,
+  //                            sizeof(union skc_ttrk) * runtime->config->raster_cohort.rasterize.keys);
+
+  //
+  // acquire out-of-order command queue
+  //
+  // and launch up to 3 kernels
+  //
+  // for each kernel:
+  //
+  //   set runtime "global" kernel args:
+  //
+  //   - block pool atomics
+  //   - block pool extent
+  //
+  //   set cohort "local" kernel args:
+  //
+  //   - atomics
+  //   - cmds
+  //
+  // enqueue barrier
+  // enqueue copy back of atomics on the command queue
+  // set callback on copy back event
+  // release command queue
+  //
+  struct skc_raster_cohort_atomic const * const atomics = cohort->atomics.hr;
+
+  if (atomics->cmds > 0)
+    {
+      cl(SetKernelArg(impl->kernels.rasterize_all,0,SKC_CL_ARG(runtime->block_pool.atomics.drw)));
+      cl(SetKernelArg(impl->kernels.rasterize_all,1,SKC_CL_ARG(runtime->block_pool.blocks.drw)));
+      cl(SetKernelArg(impl->kernels.rasterize_all,2,SKC_CL_ARG(runtime->block_pool.ids.drw)));
+      cl(SetKernelArg(impl->kernels.rasterize_all,3,SKC_CL_ARG(runtime->block_pool.size->ring_mask)));
+
+      cl(SetKernelArg(impl->kernels.rasterize_all,4,SKC_CL_ARG(cohort->atomics.drw)));
+      cl(SetKernelArg(impl->kernels.rasterize_all,5,SKC_CL_ARG(cohort->keys.drw)));
+
+      cl(SetKernelArg(impl->kernels.rasterize_all,6,SKC_CL_ARG(cohort->transforms.drN)));
+      cl(SetKernelArg(impl->kernels.rasterize_all,7,SKC_CL_ARG(cohort->clips.drN)));
+      cl(SetKernelArg(impl->kernels.rasterize_all,8,SKC_CL_ARG(cohort->cmds.drw)));
+      cl(SetKernelArg(impl->kernels.rasterize_all,9,SKC_CL_ARG(atomics->cmds)));
+
+      skc_device_enqueue_kernel(runtime->device,
+                                SKC_DEVICE_KERNEL_ID_RASTERIZE_ALL,
+                                cohort->cq,
+                                impl->kernels.rasterize_all,
+                                atomics->cmds,
+                                0,NULL,NULL);
+    }
+
+  //
+  // copyback number of TTSK keys
+  //
+  cl_event complete;
+
+  skc_extent_thr_tdrw_read(&cohort->atomics,cohort->cq,&complete);
+
+  cl(SetEventCallback(complete,CL_COMPLETE,skc_raster_cohort_rasterize_cb,grid));
+  cl(ReleaseEvent(complete));
+
+  // flush command queue
+  cl(Flush(cohort->cq));
+
+  //
+  // ALLOCATED RESOURCES
+  //
+  // path_ids       i
+  // raster_ids     i
+  // transforms     a
+  // clips          a
+  // fill_cmds      -
+  // cq             a
+  // cohort atomics a
+  // cmds           a
+  // keys           a
+  // meta           -
+  //
+}
+
+static
+void
+skc_raster_cohort_fills_expand_cb(cl_event event, cl_int status, skc_grid_t const grid)
+{
+  SKC_CL_CB(status);
+
+  struct skc_raster_cohort * const cohort = skc_grid_get_data(grid);
+
+  // as quickly as possible, enqueue next stage in pipeline to context command scheduler
+  SKC_SCHEDULER_SCHEDULE(cohort->impl->runtime->scheduler,skc_raster_cohort_rasterize,grid);
+}
+
+static
+void
+skc_raster_builder_cohort_grid_pfn_execute(skc_grid_t const grid)
+{
+  //
+  // ALLOCATED RESOURCES
+  //
+  // path_ids       i
+  // raster_ids     i
+  // transforms     i
+  // clips          i
+  // fill_cmds      i
+  // cq             -
+  // cohort atomics -
+  // cmds           -
+  // keys           -
+  // meta           -
+  //
+
+  // get the cohort
+  struct skc_raster_cohort * const cohort = skc_grid_get_data(grid);
+
+  // get impl
+  struct skc_raster_builder_impl * const impl    = cohort->impl;
+  struct skc_runtime             * const runtime = impl->runtime;
+
+  // acquire in-order cq
+  cohort->cq = skc_runtime_acquire_cq_in_order(runtime);
+
+  // alloc the snapshot -- could be zero-sized
+  skc_extent_phw1g_tdrNs_snap_alloc(runtime,
+                                    &impl->fill_cmds,
+                                    &cohort->fill_cmds,
+                                    cohort->cq,NULL);
+
+  // flush the cq to get the fill running
+  // cl(Flush(cohort->cq));
+
+  // create split atomics
+  skc_extent_thr_tdrw_alloc(runtime,&cohort->atomics,sizeof(struct skc_raster_cohort_atomic));
+
+  // zero the atomics
+  skc_extent_thr_tdrw_zero(&cohort->atomics,cohort->cq,NULL);
+
+  // get config
+  struct skc_config const * const config = runtime->config;
+
+  // acquire device-side extents
+  skc_extent_tdrw_alloc(runtime,
+                        &cohort->cmds,
+                        sizeof(union skc_cmd_rasterize) * config->raster_cohort.expand.cmds);
+
+  //
+  // FILLS EXPAND
+  //
+  // need result of cmd counts before launching RASTERIZE grids
+  //
+  // - OpenCL 1.2: copy atomic counters back to host and launch RASTERIZE grids from host
+  // - OpenCL 2.x: have a kernel read the sizes and launch RASTERIZE grids from the device
+  // - or launch a device-wide grid that feeds itself but that's unsatisfying
+  //
+
+  // how many commands? could be zero
+  skc_uint const work_size = skc_extent_ring_snap_count(cohort->fill_cmds.snap);
+
+  if (work_size > 0)
+    {
+      cl(SetKernelArg(impl->kernels.fills_expand,0,SKC_CL_ARG(impl->runtime->block_pool.blocks.drw)));
+      cl(SetKernelArg(impl->kernels.fills_expand,1,SKC_CL_ARG(cohort->atomics.drw)));
+      cl(SetKernelArg(impl->kernels.fills_expand,2,SKC_CL_ARG(runtime->handle_pool.map.drw)));
+      cl(SetKernelArg(impl->kernels.fills_expand,3,SKC_CL_ARG(cohort->fill_cmds.drN)));
+      cl(SetKernelArg(impl->kernels.fills_expand,4,SKC_CL_ARG(cohort->cmds.drw)));
+
+      skc_device_enqueue_kernel(runtime->device,
+                                SKC_DEVICE_KERNEL_ID_FILLS_EXPAND,
+                                cohort->cq,
+                                impl->kernels.fills_expand,
+                                work_size,
+                                0,NULL,NULL);
+    }
+
+  //
+  // copyback number of rasterization commands
+  //
+  cl_event complete;
+
+  skc_extent_thr_tdrw_read(&cohort->atomics,cohort->cq,&complete);
+
+  cl(SetEventCallback(complete,CL_COMPLETE,skc_raster_cohort_fills_expand_cb,grid));
+  cl(ReleaseEvent(complete));
+
+  // flush command queue
+  cl(Flush(cohort->cq));
+
+  //
+  // ALLOCATED RESOURCES
+  //
+  // path_ids       i
+  // raster_ids     i
+  // transforms     i
+  // clips          i
+  // fill_cmds      s
+  // cq             a
+  // cohort atomics a
+  // cmds           a
+  // keys           -
+  // meta           -
+  //
+}
+
+//
+// move grid into waiting state
+//
+// this entails allocating a cohort from the temporary extent
+//
+
+static
+void
+skc_raster_builder_cohort_grid_pfn_waiting(skc_grid_t const grid)
+{
+  // get the impl
+  struct skc_raster_builder_impl * const impl    = skc_grid_get_data(grid);
+  struct skc_runtime             * const runtime = impl->runtime;
+
+  // retain the raster builder
+  impl->raster_builder->refcount += 1;
+
+  // allocate the ephemeral/temp cohort
+  skc_subbuf_id_t id;
+
+  struct skc_raster_cohort * const cohort =
+    skc_runtime_host_temp_alloc(runtime,
+                                SKC_MEM_FLAGS_READ_WRITE,
+                                sizeof(*cohort),
+                                &id,
+                                NULL);
+
+  // save the id and backpointer
+  cohort->id   = id;
+  cohort->impl = impl;
+
+  // set grid data -- replaces impl
+  skc_grid_set_data(grid,cohort);
+
+  //
+  // ACQUIRE RESOURCES FOR THE COHORT
+  //
+
+  struct skc_raster_builder * const raster_builder = impl->raster_builder;
+
+  // immediately take snapshots of all rings -- these are very inexpensive operations
+  skc_extent_phrwg_thr1s_snap_init(runtime,&raster_builder->path_ids  .ring,&cohort->path_ids);
+  skc_extent_phw1g_tdrNs_snap_init(runtime,&raster_builder->transforms.ring,&cohort->transforms);
+  skc_extent_phw1g_tdrNs_snap_init(runtime,&raster_builder->clips     .ring,&cohort->clips);
+  skc_extent_phw1g_tdrNs_snap_init(runtime,&raster_builder->fill_cmds .ring,&cohort->fill_cmds);
+  skc_extent_phrwg_tdrNs_snap_init(runtime,&raster_builder->raster_ids.ring,&cohort->raster_ids);
+
+  //
+  // ALLOCATED RESOURCES
+  //
+  // path_ids       i
+  // raster_ids     i
+  // transforms     i
+  // clips          i
+  // fill_cmds      i
+  // cq             -
+  // cohort atomics -
+  // cmds           -
+  // keys           -
+  // meta           -
+  //
+}
+
+//
+//
+//
+
+static
+void
+skc_raster_builder_cohort_create(struct skc_raster_builder_impl * const impl)
+{
+  // attach a grid
+  impl->cohort = SKC_GRID_DEPS_ATTACH(impl->runtime->deps,
+                                      &impl->cohort,
+                                      impl,
+                                      skc_raster_builder_cohort_grid_pfn_waiting,
+                                      skc_raster_builder_cohort_grid_pfn_execute,
+                                      skc_raster_builder_cohort_grid_pfn_dispose);
+}
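The "immediately take snapshots of all rings" step in pfn_waiting above is cheap because a snapshot is just a pair of ring indices; nothing is copied. A hedged sketch of the bookkeeping idea (this is not the real skc_extent_ring API, whose snap_from/snap_to accessors appear earlier in this file):

#include <stdint.h>

struct ring_snap
{
  uint32_t from; // first entry belonging to this snapshot
  uint32_t to;   // one past the last entry written at snapshot time
};

struct ring
{
  uint32_t head; // next write index
  uint32_t mark; // start of the not-yet-snapshotted span
};

// O(1): record the outstanding span and advance the checkpoint
static struct ring_snap ring_snapshot(struct ring * const r)
{
  struct ring_snap const snap = { .from = r->mark, .to = r->head };

  r->mark = r->head; // later writes belong to the next snapshot

  return snap;
}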
+
+//
+//
+//
+
+static
+skc_err
+skc_raster_builder_pfn_add(struct skc_raster_builder_impl * const impl,
+                           skc_path_t               const *       paths,
+                           skc_uint                               count)
+{
+  // validate and retain the paths
+  skc_err err;
+
+  err = skc_runtime_handle_device_validate_retain(impl->runtime,
+                                                  SKC_TYPED_HANDLE_TYPE_IS_PATH,
+                                                  paths,
+                                                  count);
+
+  if (err)
+    return err;
+
+  skc_runtime_handle_device_retain(impl->runtime,paths,count);
+
+  // make sure there is a grid
+  if (impl->cohort == NULL) {
+    skc_raster_builder_cohort_create(impl);
+  }
+
+  // declare that the rasterization grid happens after the paths
+  while (count-- > 0)
+    skc_grid_happens_after_handle(impl->cohort,SKC_TYPED_HANDLE_TO_HANDLE(*paths++));
+
+  return SKC_ERR_SUCCESS;
+}
+
+//
+//
+//
+
+static
+void
+skc_raster_builder_pfn_end(struct skc_raster_builder_impl * const impl, skc_raster_t * const raster)
+{
+  //
+  // acquire a host-managed raster handle and bump its reference count
+  // to 2 -- handles will be released (reduced to 1) once the rasters
+  // are completely rasterized
+  //
+  *raster = skc_runtime_handle_device_acquire(impl->runtime);
+
+  // make sure there is a grid
+  if (impl->cohort == NULL) {
+    skc_raster_builder_cohort_create(impl);
+  }
+
+  // map a handle to a grid
+  skc_grid_map(impl->cohort,*raster);
+}
+
+//
+// snapshot the ring and lazily start the grid
+//
+// FIXME -- might want to revisit this and settle on an even more
+// opaque implementation.  Some options:
+//
+// - never let the SKC API expose a forced grid start
+// - make snapshots kick off a forced grid start
+// - be lazy all the time everywhere
+//
+
+static
+void
+skc_raster_builder_pfn_start(struct skc_raster_builder_impl * const impl)
+{
+  skc_grid_t const cohort = impl->cohort;
+
+  if (cohort != NULL) {
+    skc_grid_start(cohort);
+  }
+}
+
+//
+// NOTE: THIS MIGHT BE REMOVED
+//
+
+static
+void
+skc_raster_builder_pfn_force(struct skc_raster_builder_impl * const impl)
+{
+  skc_grid_t const cohort = impl->cohort;
+
+  if (cohort != NULL) {
+    skc_grid_force(cohort);
+  }
+}
+
+//
+//
+//
+
+skc_err
+skc_raster_builder_cl_12_create(struct skc_context        *   const context,
+                                struct skc_raster_builder * * const raster_builder)
+{
+  struct skc_runtime * const runtime = context->runtime;
+
+  // allocate raster builder
+  (*raster_builder) = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(**raster_builder));
+
+  // refcount
+  (*raster_builder)->refcount = 1;
+
+  // state
+  SKC_ASSERT_STATE_INIT((*raster_builder),SKC_RASTER_BUILDER_STATE_READY);
+
+  // allocate runtime raster builder
+  struct skc_raster_builder_impl * const impl = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*impl));
+
+  // save the impl
+  (*raster_builder)->impl = impl;
+
+  // initialize impl
+  impl->raster_builder = (*raster_builder);
+  impl->runtime        = runtime;
+  impl->cohort         = NULL;
+
+  // get config
+  struct skc_config const * const config = runtime->config;
+
+  skc_extent_phrwg_thr1s_alloc(runtime,&impl->path_ids  ,sizeof(skc_path_t         ) * config->raster_cohort.path_ids  .elem_count);
+  skc_extent_phw1g_tdrNs_alloc(runtime,&impl->transforms,sizeof(union skc_transform) * config->raster_cohort.transforms.elem_count);
+  skc_extent_phw1g_tdrNs_alloc(runtime,&impl->clips     ,sizeof(union skc_path_clip) * config->raster_cohort.clips     .elem_count);
+  skc_extent_phw1g_tdrNs_alloc(runtime,&impl->fill_cmds ,sizeof(union skc_cmd_fill ) * config->raster_cohort.fill      .elem_count);
+  skc_extent_phrwg_tdrNs_alloc(runtime,&impl->raster_ids,sizeof(skc_raster_t       ) * config->raster_cohort.raster_ids.elem_count);
+
+  // retain the context
+  // skc_context_retain(context);
+
+  (*raster_builder)->context = context;
+
+  (*raster_builder)->add     = skc_raster_builder_pfn_add;
+  (*raster_builder)->end     = skc_raster_builder_pfn_end;
+  (*raster_builder)->start   = skc_raster_builder_pfn_start;
+  (*raster_builder)->force   = skc_raster_builder_pfn_force;
+  (*raster_builder)->release = skc_raster_builder_pfn_release;
+
+  // initialize raster builder with host-writable buffers
+  (*raster_builder)->path_ids  .extent = impl->path_ids.hrw;
+  (*raster_builder)->transforms.extent = impl->transforms.hw1;
+  (*raster_builder)->clips     .extent = impl->clips.hw1;
+  (*raster_builder)->fill_cmds .extent = impl->fill_cmds.hw1;
+  (*raster_builder)->raster_ids.extent = impl->raster_ids.hrw;
+
+  //
+  // the rings perform bookkeeping on the extents
+  //
+  // the ring snapshotting and checkpointing are necessary because
+  // another part of the API can _force_ the raster cohort to flush
+  // its work-in-progress commands but only up to a checkpointed
+  // boundary
+  //
+  skc_extent_ring_init(&(*raster_builder)->path_ids.ring,
+                       config->raster_cohort.path_ids.elem_count,
+                       config->raster_cohort.path_ids.snap_count,
+                       sizeof(skc_path_t));
+
+  skc_extent_ring_init(&(*raster_builder)->transforms.ring,
+                       config->raster_cohort.transforms.elem_count,
+                       config->raster_cohort.transforms.snap_count,
+                       sizeof(union skc_transform));
+
+  skc_extent_ring_init(&(*raster_builder)->clips.ring,
+                       config->raster_cohort.clips.elem_count,
+                       config->raster_cohort.clips.snap_count,
+                       sizeof(union skc_path_clip));
+
+  skc_extent_ring_init(&(*raster_builder)->fill_cmds.ring,
+                       config->raster_cohort.fill.elem_count,
+                       config->raster_cohort.fill.snap_count,
+                       sizeof(union skc_cmd_fill));
+
+  skc_extent_ring_init(&(*raster_builder)->raster_ids.ring,
+                       config->raster_cohort.raster_ids.elem_count,
+                       config->raster_cohort.raster_ids.snap_count,
+                       sizeof(skc_raster_t));
+
+  //
+  // acquire kernels
+  //
+  impl->kernels.fills_expand  = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_FILLS_EXPAND);
+  impl->kernels.rasterize_all = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_RASTERIZE_ALL);
+
+#if 0
+  impl->kernels.rasterize_lines  = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_RASTERIZE_LINES);
+  impl->kernels.rasterize_quads  = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_RASTERIZE_QUADS);
+  impl->kernels.rasterize_cubics = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_RASTERIZE_CUBICS);
+#endif
+
+  impl->kernels.segment       = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_SEGMENT_TTRK);
+  impl->kernels.rasters_alloc = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_RASTERS_ALLOC);
+  impl->kernels.prefix        = skc_device_acquire_kernel(runtime->device,SKC_DEVICE_KERNEL_ID_PREFIX);
+
+  return SKC_ERR_SUCCESS;
+}
+
+//
+//
+//
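To close the loop, a hedged sketch of how a client might drive the builder through the function-pointer table installed above (the error handling and call ordering are illustrative; only the signatures come from this file):

// illustrative driver -- assumes the context and path handles exist
static skc_err rasterize_paths(struct skc_context * const context,
                               skc_path_t const   *       paths,
                               skc_uint                   count,
                               skc_raster_t       * const raster)
{
  struct skc_raster_builder * rb;

  skc_err err = skc_raster_builder_cl_12_create(context,&rb);

  if (err)
    return err;

  // declare the fill commands' paths and seal the raster
  err = rb->add(rb->impl,paths,count);

  if (err == SKC_ERR_SUCCESS)
    {
      rb->end(rb->impl,raster); // acquire a handle and map it to the grid
      rb->start(rb->impl);      // snapshot the rings and launch the cohort
    }

  rb->release(rb->impl);

  return err;
}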