aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c')
-rw-r--r--src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c938
1 files changed, 938 insertions, 0 deletions
diff --git a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c
new file mode 100644
index 0000000000..aebe8fdc1d
--- /dev/null
+++ b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c
@@ -0,0 +1,938 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "common/cl/assert_cl.h"
+
+#include "tile.h"
+#include "raster.h"
+#include "macros.h"
+
+#include "config_cl.h"
+#include "runtime_cl_12.h"
+
+#include "device_cl_12.h"
+
+#include "hs/cl/hs_cl_launcher.h"
+#include "hs/cl/gen9/hs_cl.h"
+
+//
+// KERNEL FLAVOR SELECTION -- the #if chains below test SPIRV first,
+// then BINARY, then SRC, so the first nonzero flag wins
+
+#define SKC_KERNEL_SPIRV 0
+#define SKC_KERNEL_BINARY 1
+#define SKC_KERNEL_SRC 0
+
+//
+//
+//
+
+#if SKC_KERNEL_SPIRV
+
+#include "inl/block_pool_init.pre.spv.inl"
+#include "inl/paths_copy.pre.spv.inl"
+#include "inl/fills_expand.pre.spv.inl"
+#include "inl/rasterize.pre.spv.inl"
+#include "inl/segment_ttrk.pre.spv.inl"
+#include "inl/rasters_alloc.pre.spv.inl"
+#include "inl/prefix.pre.spv.inl"
+#include "inl/place.pre.spv.inl"
+#include "inl/segment_ttck.pre.spv.inl"
+#include "inl/render.pre.spv.inl"
+#include "inl/paths_reclaim.pre.spv.inl"
+#include "inl/rasters_reclaim.pre.spv.inl"
+
+#elif SKC_KERNEL_BINARY
+
+#include "inl/block_pool_init.pre.bin.inl"
+#include "inl/paths_copy.pre.bin.inl"
+#include "inl/fills_expand.pre.bin.inl"
+#include "inl/rasterize.pre.bin.inl"
+#include "inl/segment_ttrk.pre.bin.inl"
+#include "inl/rasters_alloc.pre.bin.inl"
+#include "inl/prefix.pre.bin.inl"
+#include "inl/place.pre.bin.inl"
+#include "inl/segment_ttck.pre.bin.inl"
+#include "inl/render.pre.bin.inl"
+#include "inl/paths_reclaim.pre.bin.inl"
+#include "inl/rasters_reclaim.pre.bin.inl"
+
+#elif SKC_KERNEL_SRC
+
+#include "inl/block_pool_init.pre.src.inl"
+#include "inl/paths_copy.pre.src.inl"
+#include "inl/fills_expand.pre.src.inl"
+#include "inl/rasterize.pre.src.inl"
+#include "inl/segment_ttrk.pre.src.inl"
+#include "inl/rasters_alloc.pre.src.inl"
+#include "inl/prefix.pre.src.inl"
+#include "inl/place.pre.src.inl"
+#include "inl/segment_ttck.pre.src.inl"
+#include "inl/render.pre.src.inl"
+#include "inl/paths_reclaim.pre.src.inl"
+#include "inl/rasters_reclaim.pre.src.inl"
+
+#endif
+
+// STATIC CONFIGURATION -- skc_device_create() publishes it as runtime->config
+// FIXME -- THE CONFIG INITIALIZATION IS ONLY HERE TEMPORARILY
+//
+
+static
+struct skc_config const config =
+ {
+ .suballocator = {
+ .host = {
+ .size = 1024 * 1024, // words
+ .subbufs = 1024 // must be <= (1 << (8 * sizeof(skc_subbuf_id_t)))
+ },
+ .device = {
+ .size = 128 * 1024 * 1024, // NOTE(review): units not stated here (host size is in words) -- confirm
+ .subbufs = 1024 // must be <= (1 << (8 * sizeof(skc_subbuf_id_t)))
+ }
+ },
+
+ .scheduler = {
+ .size = 4096 // 128 // fixme -- this is just for testing -- too big
+ },
+
+ .subblock = {
+ .words = SKC_DEVICE_SUBBLOCK_WORDS, // words per subblock -- pow2
+ .bytes = SKC_DEVICE_SUBBLOCK_WORDS * sizeof(skc_uint) // bytes per subblock -- pow2
+ },
+
+ .block = {
+ .words = SKC_DEVICE_BLOCK_WORDS, // words per block -- pow2
+ .bytes = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint), // bytes per block -- pow2
+ .subblocks = SKC_DEVICE_BLOCK_WORDS / SKC_DEVICE_SUBBLOCK_WORDS // subblocks per block -- block.bytes >= subblock.bytes
+ },
+
+ .block_pool = {
+ .pool_size = 524288, // blocks in pool -- 128 MB
+ .ring_pow2 = 524288, // blocks in pool rounded up pow2
+ .ring_mask = 524288 - 1 // ring_pow2 - 1
+ },
+
+ .cq_pool = {
+#ifndef NDEBUG
+ .type = SKC_CQ_TYPE_IN_ORDER_PROFILING, // builds without NDEBUG only
+#else
+ .type = 0,
+#endif
+ .size = 8
+ },
+
+ .handle_pool = {
+ .size = 262144, // large fraction of block pool size (for now, 1:2)
+ .width = SKC_RECLAIM_ARRAY_SIZE,
+ .recs = 256 // too many? too few?
+ },
+
+ .tile = {
+ .width = SKC_TILE_WIDTH, // tile width in pixels
+ .height = SKC_TILE_HEIGHT, // tile height in pixels
+ .ratio = SKC_TILE_HEIGHT / SKC_TILE_WIDTH // subblocks per TTPB
+ },
+
+ .paths_copy = {
+
+ .buffer = {
+ .count = 16 // # of subbufs in buffer
+ },
+
+ .subbuf = {
+ .count = 1024 // # of blocks/commands in subbuf
+ },
+
+ .block = {
+ .subbuf = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint) * 1024, // block.bytes * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN
+ .buffer = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint) * 1024 * 16 // block.bytes * subbuf.blocks * subbuf.count
+ },
+
+ .command = {
+ .subbuf = sizeof(skc_uint) * 1024, // sizeof(skc_uint) * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN
+ .buffer = sizeof(skc_uint) * 1024 * 16 // sizeof(skc_uint) * subbuf.blocks * subbuf.count
+ },
+
+ // skc_uint paths_lowat;
+ },
+
+ .raster_cohort = {
+ .path_ids = {
+ .elem_count = 8192,
+ .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER
+ },
+
+ .transforms = {
+ .elem_count = 8192,
+ .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER
+ },
+
+ .clips = {
+ .elem_count = 8192,
+ .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER
+ },
+
+ .fill = {
+ .elem_count = 8192,
+ .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER
+ },
+
+ .raster_ids = {
+ .elem_count = 8192,
+ .snap_count = (1<<SKC_TTRK_HI_BITS_COHORT) // 256
+ },
+
+ .expand = {
+ .cmds = 1024*128,
+ },
+
+ .rasterize = {
+ .keys = 1024*1024
+ }
+ },
+
+ .composition = {
+ .cmds = {
+ .elem_count = 1024*16,
+ .snap_count = 1024
+ },
+ .raster_ids = {
+ .elem_count = 1024*1024
+ },
+ .keys = {
+ .elem_count = 1024*1024,
+ }
+ },
+ };
+
+//
+// OPENCL BUILD OPTIONS -- SKC_BUILD_OPTIONS picks the string that is
+// attached to every entry of the program_sources table
+
+static char const cl_build_options_optimized[] =
+ "-cl-std=CL1.2 "
+ "-cl-single-precision-constant "
+ "-cl-denorms-are-zero "
+ "-cl-mad-enable "
+ "-cl-no-signed-zeros "
+ "-cl-fast-relaxed-math "
+ "-cl-kernel-arg-info ";
+
+static char const cl_build_options_debug[] =
+ "-cl-std=CL1.2 -cl-kernel-arg-info -g"; // -s c:/users/allanmac/home/google/skia_internal/src/compute/skc";
+
+// #define SKC_BUILD_OPTIONS cl_build_options_debug
+#define SKC_BUILD_OPTIONS cl_build_options_optimized
+
+//
+// One OpenCL program: display name, build options and the embedded
+// program image together with its size
+
+struct skc_program_source
+{
+ char const * name; // stringified program name -- printed while building
+ char const * options; // handed to clBuildProgram()
+ char const * src; // embedded SPIR-V, binary or source image
+ size_t const srclen; // sizeof() the embedded image symbol
+};
+
+//
+// THIS IS A RELATIVELY COMPACT WAY OF DECLARING EACH PROGRAM SOURCE
+// AND ITS BUILD OPTIONS
+//
+
+union skc_program_sources
+{
+ struct { // named access -- one member per program
+ struct skc_program_source block_pool_init;
+ struct skc_program_source paths_copy;
+ struct skc_program_source fills_expand;
+ struct skc_program_source rasterize;
+ struct skc_program_source segment_ttrk;
+ struct skc_program_source rasters_alloc;
+ struct skc_program_source prefix;
+ struct skc_program_source place;
+ struct skc_program_source segment_ttck;
+ struct skc_program_source render;
+ struct skc_program_source paths_reclaim;
+ struct skc_program_source rasters_reclaim;
+ };
+ struct skc_program_source sources[]; // flat view overlaying the named members
+};
+
+typedef size_t * (*skc_grid_shaper)(size_t const work_size, // in: amount of work to launch
+ cl_uint * const work_dim, // out: number of grid dimensions
+ size_t * const global_work_size, // out: global work size
+ size_t * const local_work_size); // out: local work size -- shapers return this pointer or NULL
+struct skc_program_kernel
+{
+ char const * name; // kernel name handed to clCreateKernel()
+ skc_grid_shaper shaper; // computes the launch grid in skc_device_enqueue_kernel()
+ skc_device_kernel_id id; // slot of this kernel in skc_device::kernels
+};
+
+union skc_program_kernels
+{
+ struct { // kernels grouped by the program that contains them
+ struct skc_program_kernel block_pool_init[2];
+ struct skc_program_kernel paths_copy [2];
+ struct skc_program_kernel fills_expand [1];
+ struct skc_program_kernel rasterize [6];
+ struct skc_program_kernel segment_ttrk [1];
+ struct skc_program_kernel rasters_alloc [1];
+ struct skc_program_kernel prefix [1];
+ struct skc_program_kernel place [1];
+ struct skc_program_kernel segment_ttck [1];
+ struct skc_program_kernel render [1];
+ struct skc_program_kernel paths_reclaim [1];
+ struct skc_program_kernel rasters_reclaim[1];
+ };
+ struct skc_program_kernel kernels[]; // flat view -- indexed by kernel id in skc_device_enqueue_kernel(); NOTE(review): relies on table order matching the skc_device_kernel_id values -- confirm
+};
+
+//
+// SKC_KERNEL_SUFFIX(n) forms the embedded image symbol for program n
+// according to the kernel flavor selected near the top of this file
+
+#if SKC_KERNEL_SPIRV // PROGRAM IS SPIR-V
+#define SKC_KERNEL_SUFFIX(n) n ## _pre_spv
+#elif SKC_KERNEL_BINARY // PROGRAM IS BINARY
+#define SKC_KERNEL_SUFFIX(n) n ## _pre_ir
+#elif SKC_KERNEL_SRC // PROGRAM IS SOURCE CODE
+#define SKC_KERNEL_SUFFIX(n) n ## _pre_cl
+#else
+#error "SKC_KERNEL_???"
+#endif
+
+//
+// SKC_PROGRAM_SOURCE() expands to one designated skc_program_source
+// initializer; SKC_PROGRAM_KERNEL() to a kernel's name and its shaper
+
+#define SKC_PROGRAM_SOURCE_EXPAND(k,s,o) .k = { SKC_STRINGIFY(k), o, s, sizeof(s) }
+#define SKC_PROGRAM_SOURCE(k,o) SKC_PROGRAM_SOURCE_EXPAND(k,SKC_KERNEL_SUFFIX(k),o)
+#define SKC_PROGRAM_KERNEL(k) "skc_kernel_" SKC_STRINGIFY(k), SKC_CONCAT(skc_device_shaper_,k)
+
+//
+//
+//
+
+// Launch shape for block_pool_init_ids: a 1-D grid of work_size
+// work-items.  work_local is not written.
+static
+size_t *
+skc_device_shaper_block_pool_init_ids(size_t  const         work_size,
+                                      cl_uint       * const work_dim,
+                                      size_t        * const work_global,
+                                      size_t        * const work_local)
+{
+  *work_global = work_size;
+  *work_dim    = 1;
+
+  // NULL leaves the choice of local work size to the runtime
+  return NULL;
+}
+
+// Launch shape for block_pool_init_atomics: always a 1-D grid of two
+// work-items; work_size is ignored and work_local is not written.
+static
+size_t *
+skc_device_shaper_block_pool_init_atomics(size_t  const         work_size,
+                                          cl_uint       * const work_dim,
+                                          size_t        * const work_global,
+                                          size_t        * const work_local)
+{
+  *work_global = 2;
+  *work_dim    = 1;
+
+  // NULL leaves the choice of local work size to the runtime
+  return NULL;
+}
+
+// Launch shape for paths_alloc: always a single work-item in a 1-D
+// grid; work_size is ignored and work_local is not written.
+static
+size_t *
+skc_device_shaper_paths_alloc(size_t  const         work_size,
+                              cl_uint       * const work_dim,
+                              size_t        * const work_global,
+                              size_t        * const work_local)
+{
+  *work_global = 1;
+  *work_dim    = 1;
+
+  // NULL leaves the choice of local work size to the runtime
+  return NULL;
+}
+
+
+// Launch shape for paths_copy: a 1-D grid with
+// SKC_PATHS_COPY_SUBGROUP_SIZE work-items per unit of work_size.
+// Pinning the local size to SKC_PATHS_COPY_SUBGROUP_SIZE is currently
+// disabled, so work_local is not written and NULL is returned.
+static
+size_t *
+skc_device_shaper_paths_copy(size_t  const         work_size,
+                             cl_uint       * const work_dim,
+                             size_t        * const work_global,
+                             size_t        * const work_local)
+{
+  *work_dim    = 1;
+  *work_global = SKC_PATHS_COPY_SUBGROUP_SIZE * work_size;
+
+  // NULL leaves the choice of local work size to the runtime
+  return NULL;
+}
+
+// Launch shape for fills_expand: a 1-D grid with one workgroup of
+// SKC_FILLS_EXPAND_SUBGROUP_SIZE work-items per unit of work_size.
+// Returns work_local, which holds the explicit local size.
+static
+size_t *
+skc_device_shaper_fills_expand(size_t  const         work_size,
+                               cl_uint       * const work_dim,
+                               size_t        * const work_global,
+                               size_t        * const work_local)
+{
+  *work_local  = SKC_FILLS_EXPAND_SUBGROUP_SIZE;
+  *work_global = SKC_FILLS_EXPAND_SUBGROUP_SIZE * work_size;
+  *work_dim    = 1;
+
+  return work_local;
+}
+
+// Common launch shape for all rasterize kernels: a 1-D grid with one
+// workgroup of SKC_RASTERIZE_SUBGROUP_SIZE work-items per unit of
+// work_size.  Returns work_local, which holds the explicit local size.
+static
+size_t *
+skc_device_shaper_rasterize(size_t  const         work_size,
+                            cl_uint       * const work_dim,
+                            size_t        * const work_global,
+                            size_t        * const work_local)
+{
+  *work_local  = SKC_RASTERIZE_SUBGROUP_SIZE;
+  *work_global = SKC_RASTERIZE_SUBGROUP_SIZE * work_size;
+  *work_dim    = 1;
+
+  return work_local;
+}
+
+// rasterize_all uses the common rasterize launch shape.
+static
+size_t *
+skc_device_shaper_rasterize_all(size_t  const         work_size,
+                                cl_uint       * const work_dim,
+                                size_t        * const work_global,
+                                size_t        * const work_local)
+{
+  size_t * const local = skc_device_shaper_rasterize(work_size,
+                                                     work_dim,
+                                                     work_global,
+                                                     work_local);
+  return local;
+}
+
+// rasterize_lines uses the common rasterize launch shape.
+static
+size_t *
+skc_device_shaper_rasterize_lines(size_t  const         work_size,
+                                  cl_uint       * const work_dim,
+                                  size_t        * const work_global,
+                                  size_t        * const work_local)
+{
+  size_t * const local = skc_device_shaper_rasterize(work_size,
+                                                     work_dim,
+                                                     work_global,
+                                                     work_local);
+  return local;
+}
+
+// rasterize_quads uses the common rasterize launch shape.
+static
+size_t *
+skc_device_shaper_rasterize_quads(size_t  const         work_size,
+                                  cl_uint       * const work_dim,
+                                  size_t        * const work_global,
+                                  size_t        * const work_local)
+{
+  size_t * const local = skc_device_shaper_rasterize(work_size,
+                                                     work_dim,
+                                                     work_global,
+                                                     work_local);
+  return local;
+}
+
+// rasterize_cubics uses the common rasterize launch shape.
+static
+size_t *
+skc_device_shaper_rasterize_cubics(size_t  const         work_size,
+                                   cl_uint       * const work_dim,
+                                   size_t        * const work_global,
+                                   size_t        * const work_local)
+{
+  size_t * const local = skc_device_shaper_rasterize(work_size,
+                                                     work_dim,
+                                                     work_global,
+                                                     work_local);
+  return local;
+}
+
+// rasterize_rat_quads uses the common rasterize launch shape.
+static
+size_t *
+skc_device_shaper_rasterize_rat_quads(size_t  const         work_size,
+                                      cl_uint       * const work_dim,
+                                      size_t        * const work_global,
+                                      size_t        * const work_local)
+{
+  size_t * const local = skc_device_shaper_rasterize(work_size,
+                                                     work_dim,
+                                                     work_global,
+                                                     work_local);
+  return local;
+}
+
+// rasterize_rat_cubics uses the common rasterize launch shape.
+static
+size_t *
+skc_device_shaper_rasterize_rat_cubics(size_t  const         work_size,
+                                       cl_uint       * const work_dim,
+                                       size_t        * const work_global,
+                                       size_t        * const work_local)
+{
+  size_t * const local = skc_device_shaper_rasterize(work_size,
+                                                     work_dim,
+                                                     work_global,
+                                                     work_local);
+  return local;
+}
+
+// Launch shape for rasters_alloc: a 1-D grid whose global size is
+// work_size rounded up to whole groups of SKC_RASTERS_ALLOC_GROUP_SIZE.
+// Returns work_local, which holds the explicit local size.
+static
+size_t *
+skc_device_shaper_rasters_alloc(size_t  const         work_size,
+                                cl_uint       * const work_dim,
+                                size_t        * const work_global,
+                                size_t        * const work_local)
+{
+  *work_dim    = 1;
+  *work_local  = SKC_RASTERS_ALLOC_GROUP_SIZE;
+
+  // pad the grid so that it consists of whole groups only
+  *work_global = SKC_ROUND_UP(work_size,SKC_RASTERS_ALLOC_GROUP_SIZE);
+
+  return work_local;
+}
+
+// Launch shape for segment_ttrk.  work_size is a key count: it is
+// rounded up to a whole slab of HS_LANES_PER_WARP * HS_KEYS_PER_LANE
+// keys and each work-item covers HS_KEYS_PER_LANE of them.  Returns
+// work_local, which holds a local size of HS_LANES_PER_WARP.
+static
+size_t *
+skc_device_shaper_segment_ttrk(size_t  const         work_size,
+                               cl_uint       * const work_dim,
+                               size_t        * const work_global,
+                               size_t        * const work_local)
+{
+  size_t const padded_keys = SKC_ROUND_UP(work_size,HS_LANES_PER_WARP*HS_KEYS_PER_LANE);
+
+  *work_dim    = 1;
+  *work_local  = HS_LANES_PER_WARP; // or just return NULL
+  *work_global = padded_keys / HS_KEYS_PER_LANE;
+
+  return work_local;
+}
+
+// Launch shape for segment_ttck.  work_size is a key count: it is
+// rounded up to a whole slab of HS_LANES_PER_WARP * HS_KEYS_PER_LANE
+// keys and each work-item covers HS_KEYS_PER_LANE of them.  Returns
+// work_local, which holds a local size of HS_LANES_PER_WARP.
+static
+size_t *
+skc_device_shaper_segment_ttck(size_t  const         work_size,
+                               cl_uint       * const work_dim,
+                               size_t        * const work_global,
+                               size_t        * const work_local)
+{
+  size_t const padded_keys = SKC_ROUND_UP(work_size,HS_LANES_PER_WARP*HS_KEYS_PER_LANE);
+
+  *work_dim    = 1;
+  *work_local  = HS_LANES_PER_WARP; // or just return NULL
+  *work_global = padded_keys / HS_KEYS_PER_LANE;
+
+  return work_local;
+}
+
+// Launch shape for prefix: a 1-D grid with one workgroup of
+// SKC_PREFIX_SUBGROUP_SIZE work-items per unit of work_size.
+// Returns work_local, which holds the explicit local size.
+static
+size_t *
+skc_device_shaper_prefix(size_t  const         work_size,
+                         cl_uint       * const work_dim,
+                         size_t        * const work_global,
+                         size_t        * const work_local)
+{
+  *work_local  = SKC_PREFIX_SUBGROUP_SIZE;
+  *work_global = SKC_PREFIX_SUBGROUP_SIZE * work_size;
+  *work_dim    = 1;
+
+  return work_local;
+}
+
+// Launch shape for place: a 1-D grid with one workgroup of
+// SKC_PLACE_SUBGROUP_SIZE work-items per unit of work_size.
+// Returns work_local, which holds the explicit local size.
+static
+size_t *
+skc_device_shaper_place(size_t  const         work_size,
+                        cl_uint       * const work_dim,
+                        size_t        * const work_global,
+                        size_t        * const work_local)
+{
+  *work_local  = SKC_PLACE_SUBGROUP_SIZE;
+  *work_global = SKC_PLACE_SUBGROUP_SIZE * work_size;
+  *work_dim    = 1;
+
+  return work_local;
+}
+
+// Launch shape for render: a 1-D grid with one workgroup of
+// SKC_RENDER_SUBGROUP_SIZE work-items per unit of work_size.
+// Returns work_local, which holds the explicit local size.
+static
+size_t *
+skc_device_shaper_render(size_t  const         work_size,
+                         cl_uint       * const work_dim,
+                         size_t        * const work_global,
+                         size_t        * const work_local)
+{
+  *work_local  = SKC_RENDER_SUBGROUP_SIZE;
+  *work_global = SKC_RENDER_SUBGROUP_SIZE * work_size;
+  *work_dim    = 1;
+
+  return work_local;
+}
+
+// Launch shape for paths_reclaim: a fixed 1-D grid of
+// SKC_RECLAIM_ARRAY_SIZE * SKC_PATHS_RECLAIM_SUBGROUP_SIZE work-items.
+// work_size must equal SKC_RECLAIM_ARRAY_SIZE (asserted); work_local
+// is not written.
+static
+size_t *
+skc_device_shaper_paths_reclaim(size_t  const         work_size,
+                                cl_uint       * const work_dim,
+                                size_t        * const work_global,
+                                size_t        * const work_local)
+{
+  assert(work_size == SKC_RECLAIM_ARRAY_SIZE);
+
+  *work_global = SKC_RECLAIM_ARRAY_SIZE * SKC_PATHS_RECLAIM_SUBGROUP_SIZE;
+  *work_dim    = 1;
+
+  // NULL leaves the choice of local work size to the runtime
+  return NULL;
+}
+
+// Launch shape for rasters_reclaim: a fixed 1-D grid of
+// SKC_RECLAIM_ARRAY_SIZE * SKC_PATHS_RECLAIM_SUBGROUP_SIZE work-items.
+// work_size must equal SKC_RECLAIM_ARRAY_SIZE (asserted); work_local
+// is not written.
+//
+// NOTE(review): this is the *rasters* shaper yet it scales by the
+// *paths* subgroup size -- possibly a copy/paste slip; confirm whether
+// a rasters-specific subgroup size should be used instead.
+static
+size_t *
+skc_device_shaper_rasters_reclaim(size_t  const         work_size,
+                                  cl_uint       * const work_dim,
+                                  size_t        * const work_global,
+                                  size_t        * const work_local)
+{
+  assert(work_size == SKC_RECLAIM_ARRAY_SIZE);
+
+  *work_global = SKC_RECLAIM_ARRAY_SIZE * SKC_PATHS_RECLAIM_SUBGROUP_SIZE;
+  *work_dim    = 1;
+
+  // NULL leaves the choice of local work size to the runtime
+  return NULL;
+}
+
+//
+// PROGRAM TABLE -- one entry per OpenCL program; every program is
+// built with SKC_BUILD_OPTIONS
+
+static union skc_program_sources const program_sources = {
+ SKC_PROGRAM_SOURCE(block_pool_init,SKC_BUILD_OPTIONS),
+ SKC_PROGRAM_SOURCE(paths_copy, SKC_BUILD_OPTIONS),
+ SKC_PROGRAM_SOURCE(fills_expand, SKC_BUILD_OPTIONS),
+ SKC_PROGRAM_SOURCE(rasterize, SKC_BUILD_OPTIONS),
+ SKC_PROGRAM_SOURCE(segment_ttrk, SKC_BUILD_OPTIONS),
+ SKC_PROGRAM_SOURCE(rasters_alloc, SKC_BUILD_OPTIONS),
+ SKC_PROGRAM_SOURCE(prefix, SKC_BUILD_OPTIONS),
+ SKC_PROGRAM_SOURCE(place, SKC_BUILD_OPTIONS),
+ SKC_PROGRAM_SOURCE(segment_ttck, SKC_BUILD_OPTIONS),
+ SKC_PROGRAM_SOURCE(render, SKC_BUILD_OPTIONS),
+ SKC_PROGRAM_SOURCE(paths_reclaim, SKC_BUILD_OPTIONS),
+ SKC_PROGRAM_SOURCE(rasters_reclaim,SKC_BUILD_OPTIONS)
+};
+
+static union skc_program_kernels const program_kernels = { // KERNEL TABLE -- grouped by containing program
+ // each entry: { kernel name, grid shaper, kernel id }
+ .block_pool_init = { { SKC_PROGRAM_KERNEL(block_pool_init_ids), SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_IDS },
+ { SKC_PROGRAM_KERNEL(block_pool_init_atomics), SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_ATOMICS } },
+
+ .paths_copy = { { SKC_PROGRAM_KERNEL(paths_alloc), SKC_DEVICE_KERNEL_ID_PATHS_ALLOC },
+ { SKC_PROGRAM_KERNEL(paths_copy) , SKC_DEVICE_KERNEL_ID_PATHS_COPY } },
+
+ .fills_expand = { { SKC_PROGRAM_KERNEL(fills_expand), SKC_DEVICE_KERNEL_ID_FILLS_EXPAND } },
+
+ .rasterize = { { SKC_PROGRAM_KERNEL(rasterize_all), SKC_DEVICE_KERNEL_ID_RASTERIZE_ALL },
+ { SKC_PROGRAM_KERNEL(rasterize_lines), SKC_DEVICE_KERNEL_ID_RASTERIZE_LINES },
+ { SKC_PROGRAM_KERNEL(rasterize_quads), SKC_DEVICE_KERNEL_ID_RASTERIZE_QUADS },
+ { SKC_PROGRAM_KERNEL(rasterize_cubics), SKC_DEVICE_KERNEL_ID_RASTERIZE_CUBICS },
+ { SKC_PROGRAM_KERNEL(rasterize_rat_quads), SKC_DEVICE_KERNEL_ID_RASTERIZE_RAT_QUADS },
+ { SKC_PROGRAM_KERNEL(rasterize_rat_cubics), SKC_DEVICE_KERNEL_ID_RASTERIZE_RAT_CUBICS } },
+
+ .segment_ttrk = { { SKC_PROGRAM_KERNEL(segment_ttrk), SKC_DEVICE_KERNEL_ID_SEGMENT_TTRK } },
+
+ .rasters_alloc = { { SKC_PROGRAM_KERNEL(rasters_alloc), SKC_DEVICE_KERNEL_ID_RASTERS_ALLOC } },
+
+ .prefix = { { SKC_PROGRAM_KERNEL(prefix), SKC_DEVICE_KERNEL_ID_PREFIX } },
+
+ .place = { { SKC_PROGRAM_KERNEL(place), SKC_DEVICE_KERNEL_ID_PLACE } },
+
+ .segment_ttck = { { SKC_PROGRAM_KERNEL(segment_ttck) , SKC_DEVICE_KERNEL_ID_SEGMENT_TTCK } },
+
+ .render = { { SKC_PROGRAM_KERNEL(render), SKC_DEVICE_KERNEL_ID_RENDER } },
+
+ .paths_reclaim = { { SKC_PROGRAM_KERNEL(paths_reclaim), SKC_DEVICE_KERNEL_ID_PATHS_RECLAIM } },
+
+ .rasters_reclaim = { { SKC_PROGRAM_KERNEL(rasters_reclaim), SKC_DEVICE_KERNEL_ID_RASTERS_RECLAIM } }
+};
+
+//
+// PER-RUNTIME DEVICE STATE -- allocated by skc_device_create() and
+// stored in runtime->device
+
+struct skc_device
+{
+ //
+ // FIXME -- an OpenCL 2.1+ device would clone these kernels in a
+ // multithreaded system.
+ //
+ // Not having the ability to clone kernels (yet set their sticky
+ // args) was an oversight in previous versions of OpenCL.
+ //
+ // For now, we can probably get away with just a single kernel
+ // instance as long as args are set and the kernel is launched
+ // before having its arguments stomped on.
+ //
+ cl_kernel kernels [SKC_DEVICE_KERNEL_ID_COUNT]; // one kernel per id -- filled by skc_device_create_kernels()
+ size_t reqd_szs[SKC_DEVICE_KERNEL_ID_COUNT][3]; // CL_KERNEL_COMPILE_WORK_GROUP_SIZE query result per kernel
+};
+
+//
+// CREATE KERNELS
+//
+
+// Create every kernel listed in `kernels` from an already built
+// program and store each one in runtime->device->kernels[] at the
+// slot given by the entry's id.  For each kernel the compile-time
+// work group size is recorded in reqd_szs[] and the kernel name plus
+// spill and local memory sizes are logged to stderr.
+static
+void
+skc_device_create_kernels(struct skc_runtime              * const runtime,
+                          struct skc_program_kernel const * const kernels,
+                          skc_uint                          const kernel_count,
+                          cl_program                              program)
+{
+  struct skc_device * const device = runtime->device;
+
+  for (skc_uint kk=0; kk<kernel_count; kk++)
+    {
+      struct skc_program_kernel const * const entry = kernels + kk;
+      skc_uint                          const slot  = entry->id;
+      cl_int                                  err;
+
+      fprintf(stderr,"\t\"%s\"\n",entry->name);
+
+      // create the kernel and park it in its slot
+      device->kernels[slot] = clCreateKernel(program,entry->name,&err);
+      cl_ok(err);
+
+      cl_kernel const kernel = device->kernels[slot];
+
+      //
+      // FIXME -- if/when we multithread then we need to clone kernels
+      // (>=2.1) or keep programs around (<=2.0)
+      //
+
+      // record the work group size the kernel was compiled for
+      cl(GetKernelWorkGroupInfo(kernel,
+                                runtime->cl.device_id,
+                                CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
+                                sizeof(device->reqd_szs[0]),
+                                device->reqd_szs[slot],
+                                NULL));
+
+      //
+      // GEN9+ PROBING
+      //
+#define SKC_TARGET_GEN9
+#ifdef SKC_TARGET_GEN9
+
+#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108
+#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109
+#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A
+
+      cl_ulong spill_bytes;
+
+      cl(GetKernelWorkGroupInfo(kernel,
+                                runtime->cl.device_id,
+                                CL_KERNEL_SPILL_MEM_SIZE_INTEL,
+                                sizeof(spill_bytes),
+                                &spill_bytes,
+                                NULL));
+
+      fprintf(stderr,"\t\tspill mem size: %lu bytes\n",
+              (unsigned long)spill_bytes);
+
+      cl_ulong local_bytes;
+
+      cl(GetKernelWorkGroupInfo(kernel,
+                                runtime->cl.device_id,
+                                CL_KERNEL_LOCAL_MEM_SIZE,
+                                sizeof(local_bytes),
+                                &local_bytes,
+                                NULL));
+
+      fprintf(stderr,"\t\tlocal mem size: %lu bytes\n",
+              (unsigned long)local_bytes);
+#endif
+    }
+}
+
+//
+// Create one OpenCL program from its embedded image, build it for the
+// runtime's device and create all of the kernels it contains.
+//
+//   runtime      : supplies the OpenCL context/device and receives the kernels
+//   source       : program name, build options and embedded image
+//   kernels      : table of the kernels contained in this program
+//   kernel_count : number of entries in `kernels`
+//
+// Progress is logged to stderr.  The program object is released
+// before returning.
+//
+static
+void
+skc_device_build_program(struct skc_runtime              * const runtime,
+                         struct skc_program_source const * const source,
+                         struct skc_program_kernel const * const kernels,
+                         skc_uint                          const kernel_count)
+{
+  cl_program program;
+
+  fprintf(stderr,"%-20s: ",source->name);
+
+  cl_int cl_err;
+
+#if SKC_KERNEL_SPIRV // PROGRAM IS SPIR-V
+
+  fprintf(stderr,"Creating (SPIR-V) ... ");
+
+  program = clCreateProgramWithIL(runtime->cl.context,
+                                  source->src,
+                                  source->srclen,
+                                  &cl_err);
+
+#elif SKC_KERNEL_BINARY // PROGRAM IS BINARY
+
+  fprintf(stderr,"Creating (Binary) ... ");
+
+  // the image is stored as char but the API takes unsigned char --
+  // convert explicitly instead of relying on an implicit conversion
+  unsigned char const * const binary = (unsigned char const *)source->src;
+
+  // per-device load status -- preset so that it is defined even if
+  // program creation fails before the binary is examined
+  cl_int status = CL_SUCCESS;
+
+  program = clCreateProgramWithBinary(runtime->cl.context,
+                                      1,
+                                      &runtime->cl.device_id,
+                                      &source->srclen,
+                                      &binary,
+                                      &status,
+                                      &cl_err);
+
+  // check the call itself first, then that the binary loaded for our one device
+  cl_ok(cl_err);
+  cl_ok(status);
+
+#elif SKC_KERNEL_SRC // PROGRAM IS SOURCE CODE
+
+  fprintf(stderr,"Creating (Source) ... ");
+
+  program = clCreateProgramWithSource(runtime->cl.context,
+                                      1,
+                                      (char const *[]){ source->src },
+                                      &source->srclen,
+                                      &cl_err);
+#else
+
+#error "SKC_KERNEL_???"
+
+#endif
+
+  cl_ok(cl_err);
+
+  fprintf(stderr,"Building ... ");
+
+  // build the program
+  cl(BuildProgram(program,
+                  1,
+                  &runtime->cl.device_id,
+                  source->options, // build options are ignored by binary
+                  NULL,
+                  NULL));
+
+  fprintf(stderr,"Done\n");
+
+  // build the kernels
+  skc_device_create_kernels(runtime,kernels,kernel_count,program);
+
+  // we're done with program for now
+  // can always recover it from a kernel instance
+  cl(ReleaseProgram(program));
+}
+
+//
+// RELEASE KERNELS
+//
+
+// Release the device's reference on every slot of device->kernels.
+static
+void
+skc_device_release_kernels(struct skc_device * const device)
+{
+  // unsigned index -- avoids a signed/unsigned comparison against the
+  // element count and matches the loop in skc_device_create_kernels()
+  for (skc_uint ii=0; ii<SKC_COUNT_OF(device->kernels); ii++)
+    {
+      cl(ReleaseKernel(device->kernels[ii]));
+    }
+}
+
+
+
+// Retain (clRetainKernel) the device's kernel stored under `type` and
+// return it.
+cl_kernel
+skc_device_acquire_kernel(struct skc_device  * const device,
+                          skc_device_kernel_id const type)
+{
+  cl(RetainKernel(device->kernels[type]));
+
+  return device->kernels[type];
+}
+
+//
+// INITIALIZE KERNEL ARGS
+//
+// FIXME
+//
+// pre-assign any kernel arguments that are never going to change --
+// for example, the block pool
+//
+
+//
+// SKC_DEVICE_BUILD_PROGRAM(p) builds program p and creates its
+// kernels -- it expects a variable named `runtime` to be in scope
+
+#define SKC_DEVICE_BUILD_PROGRAM(p) \
+ skc_device_build_program(runtime,&program_sources.p,program_kernels.p,SKC_COUNT_OF(program_kernels.p))
+
+
+// Allocate the device state, attach it and the static config to the
+// runtime, build every program (creating all kernels) and finally
+// create the HotSort instance.
+void
+skc_device_create(struct skc_runtime * const runtime)
+{
+  // hang a freshly allocated device off of runtime
+  runtime->device = skc_runtime_host_perm_alloc(runtime,
+                                                SKC_MEM_FLAGS_READ_WRITE,
+                                                sizeof(struct skc_device));
+
+  // hang config off of runtime
+  runtime->config = &config;
+
+  // build each program and create its kernels -- same order as the tables
+  SKC_DEVICE_BUILD_PROGRAM(block_pool_init);
+  SKC_DEVICE_BUILD_PROGRAM(paths_copy);
+  SKC_DEVICE_BUILD_PROGRAM(fills_expand);
+  SKC_DEVICE_BUILD_PROGRAM(rasterize);
+  SKC_DEVICE_BUILD_PROGRAM(segment_ttrk);
+  SKC_DEVICE_BUILD_PROGRAM(rasters_alloc);
+  SKC_DEVICE_BUILD_PROGRAM(prefix);
+  SKC_DEVICE_BUILD_PROGRAM(place);
+  SKC_DEVICE_BUILD_PROGRAM(segment_ttck);
+  SKC_DEVICE_BUILD_PROGRAM(render);
+  SKC_DEVICE_BUILD_PROGRAM(paths_reclaim);
+  SKC_DEVICE_BUILD_PROGRAM(rasters_reclaim);
+
+  // create HotSort instance -- FIXME -- how this occurs needs to be cleaned up
+  hs_create(runtime->cl.context,runtime->cl.device_id,NULL);
+}
+
+//
+// Tear down what skc_device_create() set up: release the device's
+// kernel references, then free the device structure.  The programs
+// were already released right after they were built.
+//
+// FIXME -- the HotSort instance created in skc_device_create() is not
+// disposed of here yet.
+//
+void
+skc_device_dispose(struct skc_runtime * const runtime)
+{
+  // kernels first -- their handles live inside the device allocation
+  skc_device_release_kernels(runtime->device);
+
+  skc_runtime_host_perm_free(runtime,runtime->device);
+}
+
+//
+// FIXME -- just pass the device type
+//
+
+// Enqueue `kernel` on `cq` with the launch grid computed by the shaper
+// registered for `type`.  A work_size of zero enqueues nothing.  The
+// wait list and the output event are passed through unchanged; the
+// `device` argument is currently unused.
+void
+skc_device_enqueue_kernel(struct skc_device  * const device,
+                          skc_device_kernel_id const type,
+                          cl_command_queue           cq,
+                          cl_kernel                  kernel,
+                          size_t               const work_size,
+                          cl_uint                    num_events_in_wait_list,
+                          cl_event const     * const event_wait_list,
+                          cl_event           * const event)
+{
+  // nothing to launch
+  if (work_size == 0)
+    {
+      return;
+    }
+
+  cl_uint dim;
+  size_t  global[3];
+  size_t  local [3];
+
+  // the shaper fills in dim/global and returns either `local` or NULL
+  skc_grid_shaper const shape = program_kernels.kernels[type].shaper;
+  size_t * const        lws   = shape(work_size,&dim,global,local);
+
+  cl(EnqueueNDRangeKernel(cq,
+                          kernel,
+                          dim,
+                          NULL,
+                          global,
+                          lws,
+                          num_events_in_wait_list,
+                          event_wait_list,
+                          event));
+}
+
+//
+//
+//