author     Allan MacKinnon <allanmac@google.com>            2018-06-21 09:09:56 -0700
committer  Skia Commit-Bot <skia-commit-bot@chromium.org>   2018-06-21 16:52:47 +0000
commit     c110e7941e4e051ad9004412de7b419da8bcf270 (patch)
tree       f3f0bfab677b0581d237db540b19bb2e97d40338 /src/compute/skc
parent     867ce8fc8eef76e26b1e56be66badffc3d5ec3ae (diff)
OpenGL interop is simplified when the cl_context is not created by SKC.
Added GEN9 HotSort kernels so the hs_cl_gen9 lib and hs_bench_cl app can be built.
Bug: skia:
Change-Id: I5b21d33499a6ec3524f39a51443981802b722c8b
Reviewed-on: https://skia-review.googlesource.com/136608
Commit-Queue: Allan MacKinnon <allanmac@google.com>
Reviewed-by: Mike Reed <reed@google.com>
Reviewed-by: Mike Klein <mtklein@google.com>
Diffstat (limited to 'src/compute/skc')
31 files changed, 10285 insertions, 10522 deletions
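The net effect of the API change is easiest to see in the main.c hunks below: the demo now finds the platform and device itself, creates a GL-sharing cl_context, and hands both to SKC via the new skc_context_create_cl(). The stand-alone sketch that follows condenses that flow for reference only; it assumes a Windows/WGL GL context (as the demo uses), the helper headers shown in this diff (skc_create_cl.h, common/cl/find_cl.h, common/cl/assert_cl.h), and an SKC public header assumed here as "skc.h". The wrapper function name is illustrative and is not part of this change.

// Sketch only -- condenses the new main.c flow shown in the diff below.
#include <windows.h>               // HGLRC / HDC for WGL interop, as in the demo
#include <CL/opencl.h>             // OpenCL 1.2 API and cl_gl.h interop properties

#include "skc.h"                   // assumed SKC public header (skc_err, skc_context_t)
#include "skc_create_cl.h"         // skc_context_create_cl()

#include "common/cl/find_cl.h"     // clFindIdsByName(), called through the cl() macro
#include "common/cl/assert_cl.h"   // cl() / cl_ok() error-checking macros

static skc_err
demo_create_skc_context(HGLRC const hGLRC, HDC const hDC, skc_context_t * const context)
{
  // find the OpenCL platform and device by substring -- same names the demo uses
  cl_platform_id platform_id_cl;
  cl_device_id   device_id_cl;

  cl(FindIdsByName("Intel","Graphics",
                   &platform_id_cl,
                   &device_id_cl,
                   0,NULL,NULL,
                   true));

  // the caller -- not SKC -- now creates the cl_context, so GL-sharing
  // properties can be attached before SKC ever sees the context
  cl_context_properties context_properties_cl[] =
    {
      CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id_cl,
      CL_GL_CONTEXT_KHR,   (cl_context_properties)hGLRC,
      CL_WGL_HDC_KHR,      (cl_context_properties)hDC,
      0
    };

  cl_int     cl_err;
  cl_context context_cl = clCreateContext(context_properties_cl,
                                          1,&device_id_cl,
                                          NULL,NULL,
                                          &cl_err);
  cl_ok(cl_err);

  // hand the externally created context and device to SKC
  return skc_context_create_cl(context,context_cl,device_id_cl);
}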
diff --git a/src/compute/skc/context.c b/src/compute/skc/context.c
index 8aac2ef3d1..59c7956fd5 100644
--- a/src/compute/skc/context.c
+++ b/src/compute/skc/context.c
@@ -28,23 +28,20 @@
 //
 skc_err
-skc_context_create(skc_context_t * context,
-                   char const    * target_platform_substring,
-                   char const    * target_device_substring,
-                   intptr_t        context_properties[])
+skc_context_create_cl(skc_context_t * context,
+                      cl_context      context_cl,
+                      cl_device_id    device_id_cl)
 {
   (*context) = malloc(sizeof(**context));
 
   //
-  // FIXME -- don't directly grab a CL runtime but for now juts create
-  // the CL_12 runtime here
+  // FIXME -- we'll clean up context creation by platform later. For
+  // now, just create a CL_12 context.
   //
   skc_err err;
 
-  err = skc_runtime_cl_12_create(*context,
-                                 target_platform_substring,
-                                 target_device_substring,
-                                 context_properties);
+  err = skc_runtime_cl_12_create(*context,context_cl,device_id_cl);
+
   return err;
 }
diff --git a/src/compute/skc/main.c b/src/compute/skc/main.c
index 8261f4bdf8..e0d42b31e0 100644
--- a/src/compute/skc/main.c
+++ b/src/compute/skc/main.c
@@ -21,6 +21,11 @@
 #include <stdlib.h>
 #include <conio.h>
 
+#include "skc_create_cl.h"
+
+#include "common/cl/find_cl.h"
+#include "common/cl/assert_cl.h"
+
 #include "svg/svg_doc.h"
 #include "svg2skc/svg2skc.h"
 #include "svg2skc/transform_stack.h"
@@ -49,7 +54,7 @@ skc_runtime_cl_12_debug(struct skc_context * const context);
 //
 //
 
-static
+static void
 is_render_complete(skc_surface_t surface,
                    skc_styling_t styling,
@@ -67,9 +72,9 @@
 int
 main(int argc, char** argv)
 {
   //
-  //
   //
-  if (argc <= 1)
+  //
+  if (argc <= 1) {
     fprintf(stderr,"-- missing filename\n");
     return EXIT_FAILURE; // no filename
@@ -95,28 +100,49 @@ main(int argc, char** argv)
   skc_interop_init(&window);
 
   //
+  // find platform and device by name
+  //
+  cl_platform_id platform_id_cl;
+  cl_device_id   device_id_cl;
+
+  cl(FindIdsByName("Intel","Graphics",
+                   &platform_id_cl,
+                   &device_id_cl,
+                   0,NULL,NULL,
+                   true));
+
+  //
   // get GL and device contexts
   //
   HGLRC hGLRC = wglGetCurrentContext();
   HDC   hDC   = wglGetCurrentDC();
 
   //
+  // create the CL context
   //
-  //
-  cl_context_properties context_properties[] =
+  cl_context_properties context_properties_cl[] =
     {
-      CL_CONTEXT_PLATFORM, (cl_context_properties)-1,
+      CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id_cl,
       CL_GL_CONTEXT_KHR,   (cl_context_properties)hGLRC,
       CL_WGL_HDC_KHR,      (cl_context_properties)hDC,
       0
     };
-
+
+  cl_int     cl_err;
+  cl_context context_cl = clCreateContext(context_properties_cl,
+                                          1,
+                                          &device_id_cl,
+                                          NULL,
+                                          NULL,
+                                          &cl_err); cl_ok(cl_err);
   //
-  // create context
+  // create SKC context
   //
   skc_context_t context;
 
-  skc_err err = skc_context_create(&context,"Intel","Graphics",context_properties);
+  skc_err err = skc_context_create_cl(&context,
+                                      context_cl,
+                                      device_id_cl);
 
   //
   // associate
@@ -136,14 +162,14 @@ main(int argc, char** argv)
   skc_raster_builder_t raster_builder;
 
   err = skc_raster_builder_create(context,&raster_builder);
-
+
   //
   // create a composition
   //
   skc_composition_t composition;
 
   err = skc_composition_create(context,&composition);
-
+
   //
   // create a styling instance
   //
@@ -154,7 +180,7 @@ main(int argc, char** argv)
                            svg_doc_layer_count(svg_doc),
                            1000,
                            2 * 1024 * 1024);
-
+
   //
   // create a surface
   //
@@ -191,7 +217,7 @@ main(int argc, char** argv)
       skc_transform_stack_restore(ts,ts_save);
 
       // decode layers -- places rasters
-      svg_doc_layers_decode(svg_doc,rasters,composition,styling,true/*is_srgb*/);
+      svg_doc_layers_decode(svg_doc,rasters,composition,styling,true/*is_srgb*/);
 
       // seal the composition
       skc_composition_seal(composition);
 
@@ -244,7 +270,7 @@ main(int argc, char** argv)
       // unseal the composition
       skc_composition_unseal(composition,true);
     }
-
+
   //
   // dispose of mundane resources
   //
diff --git a/src/compute/skc/platforms/cl_12/allocator_device_cl.c b/src/compute/skc/platforms/cl_12/allocator_device_cl.c
index aa44f36e87..90ae26eb71 100644
--- a/src/compute/skc/platforms/cl_12/allocator_device_cl.c
+++ b/src/compute/skc/platforms/cl_12/allocator_device_cl.c
@@ -106,7 +106,7 @@ skc_allocator_device_create(struct skc_runtime * const runtime)
                           &runtime->allocator.device.temp.suballocator,
                           "DEVICE",
                           runtime->config->suballocator.device.subbufs,
-                          runtime->cl.base_align,
+                          runtime->cl.align_bytes,
                           runtime->config->suballocator.device.size);
 
 #ifndef NDEBUG
diff --git a/src/compute/skc/platforms/cl_12/config_cl.h b/src/compute/skc/platforms/cl_12/config_cl.h
index 0172857b07..ac5cd76710 100644
--- a/src/compute/skc/platforms/cl_12/config_cl.h
+++ b/src/compute/skc/platforms/cl_12/config_cl.h
@@ -12,7 +12,6 @@
 //
 //
 
-#include "runtime_cl.h"
 #include "block_pool_cl.h"
 
 //
@@ -52,8 +51,8 @@ struct skc_config
   union skc_block_pool_size   block_pool;
 
   struct {
-    skc_cq_type_e             type;
-    skc_uint                  size;
+    cl_command_queue_properties cq_props;
+    skc_uint                    size;
   } cq_pool;
 
   struct {
diff --git a/src/compute/skc/platforms/cl_12/cq_pool_cl.c b/src/compute/skc/platforms/cl_12/cq_pool_cl.c
index 80cfe34cf8..8d1537dc40 100644
--- a/src/compute/skc/platforms/cl_12/cq_pool_cl.c
+++ b/src/compute/skc/platforms/cl_12/cq_pool_cl.c
@@ -7,17 +7,18 @@
  */
 
 //
-//
+// squelch OpenCL 1.2 deprecation warning
 //
 
-#ifndef NDEBUG
-#include <stdio.h>
+#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
 #endif
 
 //
 //
 //
 
+#include <stdio.h>
 #include <string.h>
 
 //
@@ -25,6 +26,7 @@
 //
 
 #include "runtime_cl_12.h"
+#include "common/cl/assert_cl.h"
 
 //
 // This implementation is probably excessive.
@@ -40,21 +42,77 @@ // // +static +cl_command_queue +skc_runtime_cl_12_create_cq(struct skc_runtime * const runtime, + struct skc_cq_pool * const pool) + +{ + cl_command_queue cq; + +#if 1 + // + // <= OpenCL 1.2 + // + cl_int cl_err; + + cq = clCreateCommandQueue(runtime->cl.context, + runtime->cl.device_id, + pool->cq_props, + &cl_err); cl_ok(cl_err); +#else + if (runtime_cl->version.major < 2) + { + // + // <= OpenCL 1.2 + // + cl_int cl_err; + + cq = clCreateCommandQueue(runtime_cl->context, + runtime_cl->device_id, + (cl_command_queue_properties)type, + &cl_err); cl_ok(cl_err); + } + else + { + // + // >= OpenCL 2.0 + // + cl_int cl_err; + cl_queue_properties const queue_properties[] = { + CL_QUEUE_PROPERTIES,(cl_queue_properties)type,0 + }; + + cq = clCreateCommandQueueWithProperties(runtime_cl->context, + runtime_cl->device_id, + queue_properties, + &cl_err); cl_ok(cl_err); + } +#endif + + return cq; +} + +// +// +// + void -skc_cq_pool_create(struct skc_runtime * const runtime, - struct skc_cq_pool * const pool, - skc_uint const type, - skc_uint const size) +skc_cq_pool_create(struct skc_runtime * const runtime, + struct skc_cq_pool * const pool, + cl_command_queue_properties const cq_props, + skc_uint const size) { - pool->type = type; - pool->size = size + 1; // an empty spot - pool->reads = 0; - pool->writes = size; - pool->cq = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,pool->size * sizeof(*pool->cq)); + pool->size = size + 1; // an empty spot + pool->reads = 0; + pool->writes = size; + + pool->cq_props = cq_props; + pool->cq = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE, + pool->size * sizeof(*pool->cq)); + for (skc_uint ii=0; ii<size; ii++) + pool->cq[ii] = skc_runtime_cl_12_create_cq(runtime,pool); - for (skc_uint ii=0; ii<size; ii++) { - pool->cq[ii] = skc_runtime_cl_create_cq(&runtime->cl,pool->type); - } pool->cq[size] = NULL; } @@ -77,7 +135,7 @@ skc_cq_pool_dispose(struct skc_runtime * const runtime, // // -static +static void skc_cq_pool_write(struct skc_cq_pool * const pool, cl_command_queue cq) @@ -109,14 +167,14 @@ skc_cq_pool_expand(struct skc_runtime * const runtime, pool->writes = expand; for (skc_uint ii=0; ii<expand; ii++) - pool->cq[ii] = skc_runtime_cl_create_cq(&runtime->cl,pool->type); + pool->cq[ii] = skc_runtime_cl_12_create_cq(runtime,pool); } // // // -static +static cl_command_queue skc_cq_pool_read(struct skc_runtime * const runtime, struct skc_cq_pool * const pool) @@ -141,7 +199,7 @@ skc_runtime_acquire_cq_in_order(struct skc_runtime * const runtime) } void -skc_runtime_release_cq_in_order(struct skc_runtime * const runtime, +skc_runtime_release_cq_in_order(struct skc_runtime * const runtime, cl_command_queue cq) { skc_cq_pool_write(&runtime->cq_pool,cq); diff --git a/src/compute/skc/platforms/cl_12/cq_pool_cl.h b/src/compute/skc/platforms/cl_12/cq_pool_cl.h index 0cc73a2f82..c614600e19 100644 --- a/src/compute/skc/platforms/cl_12/cq_pool_cl.h +++ b/src/compute/skc/platforms/cl_12/cq_pool_cl.h @@ -20,11 +20,12 @@ struct skc_cq_pool { - skc_cq_type_e type; - skc_uint size; - skc_uint reads; - skc_uint writes; - cl_command_queue * cq; + cl_command_queue * cq; + cl_command_queue_properties cq_props; + + skc_uint size; + skc_uint reads; + skc_uint writes; }; //l @@ -32,10 +33,10 @@ struct skc_cq_pool // void -skc_cq_pool_create(struct skc_runtime * const runtime, - struct skc_cq_pool * const pool, - skc_uint const type, - skc_uint const size); +skc_cq_pool_create(struct skc_runtime * const runtime, + struct skc_cq_pool 
* const pool, + cl_command_queue_properties const cq_props, + skc_uint const size); void skc_cq_pool_dispose(struct skc_runtime * const runtime, diff --git a/src/compute/skc/platforms/cl_12/device_cl_12.h b/src/compute/skc/platforms/cl_12/device_cl_12.h index 637b61ae10..ef574958b3 100644 --- a/src/compute/skc/platforms/cl_12/device_cl_12.h +++ b/src/compute/skc/platforms/cl_12/device_cl_12.h @@ -77,6 +77,10 @@ cl_kernel skc_device_acquire_kernel(struct skc_device * const device, skc_device_kernel_id const type); +void +skc_device_release_kernel(struct skc_device * const device, + cl_kernel kernel); + // // grid shape can vary greatly by target platform // diff --git a/src/compute/skc/platforms/cl_12/kernels/block_pool_init.cl b/src/compute/skc/platforms/cl_12/kernels/block_pool_init.cl index 726b0a7907..5abbe18939 100644 --- a/src/compute/skc/platforms/cl_12/kernels/block_pool_init.cl +++ b/src/compute/skc/platforms/cl_12/kernels/block_pool_init.cl @@ -1,64 +1,64 @@ -/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "device_cl_12.h"
-
-//
-// BEST TO RUN THESE ON AN OUT-OF-ORDER CQ
-//
-
-__kernel
-SKC_BP_INIT_IDS_KERNEL_ATTRIBS
-void
-skc_kernel_block_pool_init_ids(__global uint * const ids, uint const bp_size)
-{
- uint const gid = get_global_id(0);
-
- //
- // FIXME -- TUNE FOR ARCH -- evaluate if it's much faster to
- // accomplish this with fewer threads and using either IPC and/or
- // vector stores -- it should be on certain architectures!
- //
-
- //
- // initialize pool with sequence
- //
- if (gid < bp_size)
- ids[gid] = gid * SKC_DEVICE_SUBBLOCKS_PER_BLOCK;
-}
-
-//
-//
-//
-
-__kernel
-SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS
-void
-skc_kernel_block_pool_init_atomics(__global uint * const bp_atomics, uint const bp_size)
-{
- // the version test is to squelch a bug with the Intel OpenCL CPU
- // compiler declaring it supports the cl_intel_subgroups extension
-#if defined(cl_intel_subgroups) || defined (cl_khr_subgroups)
- uint const tid = get_sub_group_local_id();
-#else
- uint const tid = get_local_id(0);
-#endif
-
- //
- // launch two threads and store [ 0, bp_size ]
- //
- bp_atomics[tid] = tid * bp_size;
-}
-
-//
-//
-//
+/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "kernel_cl_12.h" + +// +// BEST TO RUN THESE ON AN OUT-OF-ORDER CQ +// + +__kernel +SKC_BP_INIT_IDS_KERNEL_ATTRIBS +void +skc_kernel_block_pool_init_ids(__global uint * const ids, uint const bp_size) +{ + uint const gid = get_global_id(0); + + // + // FIXME -- TUNE FOR ARCH -- evaluate if it's much faster to + // accomplish this with fewer threads and using either IPC and/or + // vector stores -- it should be on certain architectures! + // + + // + // initialize pool with sequence + // + if (gid < bp_size) + ids[gid] = gid * SKC_DEVICE_SUBBLOCKS_PER_BLOCK; +} + +// +// +// + +__kernel +SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS +void +skc_kernel_block_pool_init_atomics(__global uint * const bp_atomics, uint const bp_size) +{ + // the version test is to squelch a bug with the Intel OpenCL CPU + // compiler declaring it supports the cl_intel_subgroups extension +#if defined(cl_intel_subgroups) || defined (cl_khr_subgroups) + uint const tid = get_sub_group_local_id(); +#else + uint const tid = get_local_id(0); +#endif + + // + // launch two threads and store [ 0, bp_size ] + // + bp_atomics[tid] = tid * bp_size; +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c index aebe8fdc1d..f7e06a1062 100644 --- a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c +++ b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c @@ -19,6 +19,7 @@ #include "config_cl.h" #include "runtime_cl_12.h" +#include "kernel_cl_12.h" #include "device_cl_12.h" #include "hs/cl/hs_cl_launcher.h" @@ -124,9 +125,9 @@ struct skc_config const config = .cq_pool = { #ifndef NDEBUG - .type = SKC_CQ_TYPE_IN_ORDER_PROFILING, + .cq_props = CL_QUEUE_PROFILING_ENABLE, #else - .type = 0, + .cq_props = 0, #endif .size = 8 }, @@ -841,6 +842,14 @@ skc_device_acquire_kernel(struct skc_device * const device, return kernel; } + +void +skc_device_release_kernel(struct skc_device * const device, + cl_kernel kernel) +{ + cl(ReleaseKernel(kernel)); +} + // // INITIALIZE KERNEL ARGS // diff --git a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.h b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/kernel_cl_12.h index 0cac2261e7..0cac2261e7 100644 --- a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.h +++ b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/kernel_cl_12.h diff --git a/src/compute/skc/platforms/cl_12/kernels/fills_expand.cl b/src/compute/skc/platforms/cl_12/kernels/fills_expand.cl index 39fee75f3d..bcff0a37c1 100644 --- a/src/compute/skc/platforms/cl_12/kernels/fills_expand.cl +++ b/src/compute/skc/platforms/cl_12/kernels/fills_expand.cl @@ -1,309 +1,309 @@ -/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "block.h"
-#include "path.h"
-#include "common.h"
-#include "atomic_cl.h"
-#include "raster_builder_cl_12.h"
-#include "device_cl_12.h"
-
-//
-//
-//
-
-#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_MASK (SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)
-
-#define SKC_FILLS_EXPAND_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)
-#define SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)
-
-#define SKC_FILLS_EXPAND_ELEMS_PER_THREAD (SKC_FILLS_EXPAND_ELEMS_PER_BLOCK / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
-
-//
-//
-//
-
-#define SKC_FILLS_EXPAND_X (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
-
-//
-//
-//
-
-#if ( SKC_FILLS_EXPAND_X == 1 )
-#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_1()
-#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 0
-
-#elif ( SKC_FILLS_EXPAND_X == 2 )
-#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_2()
-#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 1
-
-#elif ( SKC_FILLS_EXPAND_X == 4 )
-#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_4()
-#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 3
-
-#elif ( SKC_FILLS_EXPAND_X == 8 )
-#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_8()
-#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 7
-
-#elif ( SKC_FILLS_EXPAND_X == 16)
-#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_16()
-#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 15
-
-#else
-#error "MISSING SKC_FILLS_EXPAND_X"
-#endif
-
-//
-// Fill and rasterize cmds only differ in their first word semantics
-//
-
-union skc_cmd_expand
-{
- union skc_cmd_fill fill;
- union skc_cmd_rasterize rasterize;
-};
-
-//
-//
-//
-
-union skc_path_elem
-{
- skc_uint u32;
- skc_float f32;
-};
-
-//
-// COMPILE-TIME AND RUN-TIME MACROS
-//
-
-#define SKC_ELEM_IN_RANGE(X,I) \
- (skc_bool)SKC_GTE_MACRO(X,(I ) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) && \
- (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
-
-#define SKC_ELEM_GTE(X,I) \
- SKC_GTE_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
-
-//
-// FIXME -- slate these for replacement
-//
-
-#define SKC_BROADCAST(E,S,I) \
- sub_group_broadcast(E##I.u32,S - I * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
-
-#define SKC_BROADCAST_LAST_HELPER(E,I) \
- sub_group_broadcast(E##I.u32,SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)
-
-#define SKC_BROADCAST_LAST(E,I) \
- SKC_BROADCAST_LAST_HELPER(E,I)
-
-//
-//
-//
-
-void
-skc_cmds_out_append(__global union skc_cmd_rasterize * const cmds_out,
- skc_uint * const out_idx,
- union skc_cmd_expand * const cmd,
- union skc_path_elem const e,
- skc_uint const e_idx)
-{
- //
- // FIXME -- we can append a large number of nodeword indices to a
- // local SMEM queue and flush when full. It may or may not be a
- // performance win on some architectures.
- //
- skc_bool const is_elem = SKC_TAGGED_BLOCK_ID_GET_TAG(e.u32) < SKC_BLOCK_ID_TAG_PATH_NEXT;
- skc_uint const offset = sub_group_scan_inclusive_add(is_elem ? 1 : 0);
-
- cmd->rasterize.nodeword = e_idx;
-
- if (is_elem) {
- cmds_out[*out_idx + offset] = cmd->rasterize;
- }
-
- *out_idx += sub_group_broadcast(offset,SKC_FILLS_EXPAND_SUBGROUP_SIZE-1);
-}
-
-//
-//
-//
-
-__kernel
-SKC_FILLS_EXPAND_KERNEL_ATTRIBS
-void
-skc_kernel_fills_expand(__global union skc_path_elem const * const blocks,
- __global skc_uint volatile * const atomics,
- __global skc_block_id_t const * const map,
- __global union skc_cmd_fill const * const cmds_in,
- __global union skc_cmd_rasterize * const cmds_out)
-{
- //
- // Need to harmonize the way we determine a subgroup's id. In this
- // kernel it's not as important because no local memory is being
- // used. Although the device/mask calc to determine subgroup and
- // lanes is still proper, we might want to make it clearer that
- // we're working with subgroups by using the subgroup API.
- //
- // every subgroup/simd that will work on the block loads the same command
- //
-#if (__OPENCL_VERSION__ < 200)
- skc_uint const cmd_stride = get_num_sub_groups();
-#else
- skc_uint const cmd_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
-#endif
- skc_uint cmd_idx = get_group_id(0) * cmd_stride + get_sub_group_id();
-
- // load fill command -- we reuse y component
- union skc_cmd_expand cmd = { .fill = cmds_in[cmd_idx] };
-
- // get the path header block from the map
- skc_block_id_t id = map[cmd.fill.path];
-
-#if 0
- if (get_sub_group_local_id() == 0)
- printf("expand[%u] = %u\n",cmd_idx,id);
-#endif
-
- //
- // blindly load all of the head elements into registers
- //
- skc_uint head_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- union skc_path_elem h##I = blocks[head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];
-
- SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
-
- //
- // pick out count.nodes and count.prims from the header
- //
- skc_uint count_nodes, count_prims;
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \
- count_nodes = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_NODES,I); \
- } \
- if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_PRIMS,I)) { \
- count_prims = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_PRIMS,I); \
- }
-
- SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
-
- //
- // debug of path head
- //
-#if 0
- skc_uint count_blocks;
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \
- count_blocks = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \
- }
-
- SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
-
- if (get_sub_group_local_id() == 0)
- printf("path header = { %5u, %5u, %5u }\n",
- count_blocks,count_nodes,count_prims);
-#endif
-
- //
- // acquire slots in the expanded cmd extent
- //
- // decrement prim_idx by 1 so we can use inclusive warp scan later
- //
- skc_uint out_idx = 0;
-
- if (get_sub_group_local_id() == 0) {
- out_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP
- (atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS,count_prims) - 1;
- }
-
- out_idx = sub_group_broadcast(out_idx,0);
-
- //
- // process ids trailing the path header
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_ELEM_GTE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \
- if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \
- if (get_sub_group_local_id() + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE < SKC_PATH_HEAD_OFFSET_IDS) { \
- h##I.u32 = SKC_TAGGED_BLOCK_ID_INVALID; \
- } \
- } \
- skc_cmds_out_append(cmds_out,&out_idx,&cmd,h##I, \
- head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); \
- }
-
- SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
-
- //
- // we're done if it was just the header
- //
- if (count_nodes == 0)
- return;
-
- //
- // otherwise, process the nodes
- //
-
- //
- // get id of next node
- //
- id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(h,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
-
- //
- // the following blocks are nodes
- //
- while (true)
- {
- // get index of each element
- skc_uint node_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();
-
- //
- // blindly load all of the node elements into registers
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- union skc_path_elem const n##I = blocks[node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];
-
- SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
-
- //
- // append all valid ids
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- skc_cmds_out_append(cmds_out,&out_idx,&cmd,n##I, \
- node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE);
-
- SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
-
- // any more nodes?
- if (--count_nodes == 0)
- return;
-
- //
- // get id of next node
- //
- id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(n,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
- }
-}
-
-//
-//
-//
+/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "block.h" +#include "path.h" +#include "common.h" +#include "atomic_cl.h" +#include "raster_builder_cl_12.h" +#include "kernel_cl_12.h" + +// +// +// + +#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_MASK (SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1) + +#define SKC_FILLS_EXPAND_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS) +#define SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS) + +#define SKC_FILLS_EXPAND_ELEMS_PER_THREAD (SKC_FILLS_EXPAND_ELEMS_PER_BLOCK / SKC_FILLS_EXPAND_SUBGROUP_SIZE) + +// +// +// + +#define SKC_FILLS_EXPAND_X (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_SUBGROUP_SIZE) + +// +// +// + +#if ( SKC_FILLS_EXPAND_X == 1 ) +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_1() +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 0 + +#elif ( SKC_FILLS_EXPAND_X == 2 ) +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_2() +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 1 + +#elif ( SKC_FILLS_EXPAND_X == 4 ) +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_4() +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 3 + +#elif ( SKC_FILLS_EXPAND_X == 8 ) +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_8() +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 7 + +#elif ( SKC_FILLS_EXPAND_X == 16) +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_16() +#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 15 + +#else +#error "MISSING SKC_FILLS_EXPAND_X" +#endif + +// +// Fill and rasterize cmds only differ in their first word semantics +// + +union skc_cmd_expand +{ + union skc_cmd_fill fill; + union skc_cmd_rasterize rasterize; +}; + +// +// +// + +union skc_path_elem +{ + skc_uint u32; + skc_float f32; +}; + +// +// COMPILE-TIME AND RUN-TIME MACROS +// + +#define SKC_ELEM_IN_RANGE(X,I) \ + (skc_bool)SKC_GTE_MACRO(X,(I ) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) && \ + (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) + +#define SKC_ELEM_GTE(X,I) \ + SKC_GTE_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) + +// +// FIXME -- slate these for replacement +// + +#define SKC_BROADCAST(E,S,I) \ + sub_group_broadcast(E##I.u32,S - I * SKC_FILLS_EXPAND_SUBGROUP_SIZE) + +#define SKC_BROADCAST_LAST_HELPER(E,I) \ + sub_group_broadcast(E##I.u32,SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1) + +#define SKC_BROADCAST_LAST(E,I) \ + SKC_BROADCAST_LAST_HELPER(E,I) + +// +// +// + +void +skc_cmds_out_append(__global union skc_cmd_rasterize * const cmds_out, + skc_uint * const out_idx, + union skc_cmd_expand * const cmd, + union skc_path_elem const e, + skc_uint const e_idx) +{ + // + // FIXME -- we can append a large number of nodeword indices to a + // local SMEM queue and flush when full. It may or may not be a + // performance win on some architectures. + // + skc_bool const is_elem = SKC_TAGGED_BLOCK_ID_GET_TAG(e.u32) < SKC_BLOCK_ID_TAG_PATH_NEXT; + skc_uint const offset = sub_group_scan_inclusive_add(is_elem ? 
1 : 0); + + cmd->rasterize.nodeword = e_idx; + + if (is_elem) { + cmds_out[*out_idx + offset] = cmd->rasterize; + } + + *out_idx += sub_group_broadcast(offset,SKC_FILLS_EXPAND_SUBGROUP_SIZE-1); +} + +// +// +// + +__kernel +SKC_FILLS_EXPAND_KERNEL_ATTRIBS +void +skc_kernel_fills_expand(__global union skc_path_elem const * const blocks, + __global skc_uint volatile * const atomics, + __global skc_block_id_t const * const map, + __global union skc_cmd_fill const * const cmds_in, + __global union skc_cmd_rasterize * const cmds_out) +{ + // + // Need to harmonize the way we determine a subgroup's id. In this + // kernel it's not as important because no local memory is being + // used. Although the device/mask calc to determine subgroup and + // lanes is still proper, we might want to make it clearer that + // we're working with subgroups by using the subgroup API. + // + // every subgroup/simd that will work on the block loads the same command + // +#if (__OPENCL_VERSION__ < 200) + skc_uint const cmd_stride = get_num_sub_groups(); +#else + skc_uint const cmd_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups +#endif + skc_uint cmd_idx = get_group_id(0) * cmd_stride + get_sub_group_id(); + + // load fill command -- we reuse y component + union skc_cmd_expand cmd = { .fill = cmds_in[cmd_idx] }; + + // get the path header block from the map + skc_block_id_t id = map[cmd.fill.path]; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("expand[%u] = %u\n",cmd_idx,id); +#endif + + // + // blindly load all of the head elements into registers + // + skc_uint head_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id(); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + union skc_path_elem h##I = blocks[head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE]; + + SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); + + // + // pick out count.nodes and count.prims from the header + // + skc_uint count_nodes, count_prims; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \ + count_nodes = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_NODES,I); \ + } \ + if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_PRIMS,I)) { \ + count_prims = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_PRIMS,I); \ + } + + SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); + + // + // debug of path head + // +#if 0 + skc_uint count_blocks; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \ + count_blocks = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \ + } + + SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); + + if (get_sub_group_local_id() == 0) + printf("path header = { %5u, %5u, %5u }\n", + count_blocks,count_nodes,count_prims); +#endif + + // + // acquire slots in the expanded cmd extent + // + // decrement prim_idx by 1 so we can use inclusive warp scan later + // + skc_uint out_idx = 0; + + if (get_sub_group_local_id() == 0) { + out_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP + (atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS,count_prims) - 1; + } + + out_idx = sub_group_broadcast(out_idx,0); + + // + // process ids trailing the path header + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_ELEM_GTE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \ + if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \ + if (get_sub_group_local_id() + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE < SKC_PATH_HEAD_OFFSET_IDS) { \ + h##I.u32 = SKC_TAGGED_BLOCK_ID_INVALID; \ + } \ + } \ + 
skc_cmds_out_append(cmds_out,&out_idx,&cmd,h##I, \ + head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); \ + } + + SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); + + // + // we're done if it was just the header + // + if (count_nodes == 0) + return; + + // + // otherwise, process the nodes + // + + // + // get id of next node + // + id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(h,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST)); + + // + // the following blocks are nodes + // + while (true) + { + // get index of each element + skc_uint node_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id(); + + // + // blindly load all of the node elements into registers + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + union skc_path_elem const n##I = blocks[node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE]; + + SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); + + // + // append all valid ids + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + skc_cmds_out_append(cmds_out,&out_idx,&cmd,n##I, \ + node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); + + SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND(); + + // any more nodes? + if (--count_nodes == 0) + return; + + // + // get id of next node + // + id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(n,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST)); + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl b/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl index 302ea14af2..63a1a43177 100644 --- a/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl +++ b/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl @@ -1,543 +1,543 @@ -/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "path.h"
-#include "block_pool_cl.h"
-#include "path_builder_cl_12.h"
-#include "device_cl_12.h"
-
-//
-//
-//
-
-#if 0
-
-//
-// SIMD AVX2
-//
-
-#define SKC_PATHS_COPY_WORDS_PER_ELEM 8
-#define SKC_PATHS_COPY_SUBGROUP_SIZE 1
-#define SKC_PATHS_COPY_KERNEL_ATTRIBUTES
-
-typedef skc_uint8 skc_paths_copy_elem;
-typedef skc_uint8 skc_pb_idx_v;
-
-#define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_8()
-
-#define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + I >= SKC_PATH_HEAD_WORDS)
-
-#endif
-
-//
-//
-//
-
-#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK (SKC_PATHS_COPY_SUBGROUP_SIZE - 1)
-#define SKC_PATHS_COPY_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
-#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
-#define SKC_PATHS_COPY_ELEMS_PER_THREAD (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE)
-
-// FIXME -- use SUBGROUP terminology everywhere
-#define SKC_PATHS_COPY_SUBGROUP_WORDS (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS)
-
-//
-//
-//
-
-#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER \
- (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS))
-
-#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER \
- (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS))
-
-// #define SKC_PATHS_COPY_HEAD_ELEMS ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS)
-
-//
-//
-//
-
-//
-// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL
-//
-
-#define SKC_CMD_PATHS_COPY_ONE_BITS (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2)
-
-#define SKC_CMD_PATHS_COPY_ONE_MASK SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS)
-
-#define SKC_CMD_PATHS_COPY_ONE (1u << SKC_CMD_PATHS_COPY_ONE_BITS)
-
-#define SKC_CMD_PATHS_COPY_GET_TAG(ti) SKC_TAGGED_BLOCK_ID_GET_TAG(ti)
-
-#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti) ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS)
-
-#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b) (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG))
-
-//
-//
-//
-
-skc_uint
-skc_sub_group_local_id()
-{
-#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
- return get_sub_group_local_id();
-#else
- return 0;
-#endif
-}
-
-//
-// convert an atomic read counter offset to a block id
-//
-
-skc_block_id_t
-skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids,
- skc_uint const bp_idx_mask,
- skc_uint const bp_reads,
- skc_uint const bp_off)
-{
- skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask;
-
- return bp_ids[bp_idx];
-}
-
-//
-//
-//
-
-void
-skc_copy_segs(__global skc_paths_copy_elem * const bp_elems, // to
- skc_uint const bp_elems_idx,
- __global skc_paths_copy_elem const * const pb_elems, // from
- skc_uint const pb_elems_idx)
-{
- for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
- {
- (bp_elems+bp_elems_idx)[ii] = (pb_elems+pb_elems_idx)[ii];
- }
-
-#if 0
- //
- // NOTE THIS IS PRINTING 8 ROWS
- //
- printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
- (skc_uint)get_global_id(0),pb_elems_idx,
- as_float((pb_elems+pb_elems_idx)[0*SKC_PATHS_COPY_SUBGROUP_SIZE]),
- as_float((pb_elems+pb_elems_idx)[1*SKC_PATHS_COPY_SUBGROUP_SIZE]),
- as_float((pb_elems+pb_elems_idx)[2*SKC_PATHS_COPY_SUBGROUP_SIZE]),
- as_float((pb_elems+pb_elems_idx)[3*SKC_PATHS_COPY_SUBGROUP_SIZE]));
- printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
- (skc_uint)get_global_id(0),pb_elems_idx,
- as_float((pb_elems+pb_elems_idx)[4*SKC_PATHS_COPY_SUBGROUP_SIZE]),
- as_float((pb_elems+pb_elems_idx)[5*SKC_PATHS_COPY_SUBGROUP_SIZE]),
- as_float((pb_elems+pb_elems_idx)[6*SKC_PATHS_COPY_SUBGROUP_SIZE]),
- as_float((pb_elems+pb_elems_idx)[7*SKC_PATHS_COPY_SUBGROUP_SIZE]));
-#endif
-}
-
-//
-//
-//
-
-void
-skc_copy_node(__global skc_paths_copy_elem * const bp_elems, // to
- skc_uint const bp_elems_idx,
- __global skc_block_id_t const * const bp_ids,
- skc_uint const bp_reads,
- skc_uint const bp_idx_mask,
- __global skc_paths_copy_elem const * const pb_elems, // from
- skc_uint const pb_elems_idx,
- skc_uint const pb_rolling)
-{
- //
- // remap block id tags bp_elems the host-side rolling counter pb_elems a
- // device-side block pool id
- //
- for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
- {
- // load block_id_tag words
- skc_paths_copy_elem elem = (pb_elems + pb_elems_idx)[ii];
-
- // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
- skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
-
- // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
-
- //
- // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
- // will _always_ be safe as long as we don't use the loaded
- // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead
- // of iterating over the vector components.
- //
-
- // only convert if original elem is not invalid
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \
- skc_block_id_t const b = bp_ids[bp_idx C]; \
- elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \
- }
-
- // printf("%2u: < %8X, %8X, %8X >\n",ii,bp_idx,b,elem C);
-
- SKC_PATHS_COPY_ELEM_EXPAND();
-
- // store the elem back
- (bp_elems+bp_elems_idx)[ii] = elem;
- }
-}
-
-//
-//
-//
-
-void
-skc_host_map_update(__global skc_uint * const host_map,
- skc_uint const block,
- skc_paths_copy_elem const elem)
-{
- //
- // write first elem to map -- FIXME -- this is a little nasty
- // because it relies on the the host handle always being the first
- // word in the path header.
- //
- // OTOH, this is not unreasonable. The alternative is to have a
- // separate kernel initializing the map.
- //
-#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
- if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE)
-#endif
- {
-#if SKC_PATHS_COPY_ELEM_WORDS == 1
- host_map[elem] = block;
-#if 0
- printf("[%u] = %u\n",elem,block);
-#endif
-#else
- host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block;
-#endif
- }
-}
-
-//
-//
-//
-
-void
-skc_copy_head(__global skc_uint * const host_map,
- skc_uint const block,
- __global skc_paths_copy_elem * const bp_elems, // to
- skc_uint const bp_elems_idx,
- __global skc_block_id_t const * const bp_ids,
- skc_uint const bp_reads,
- skc_uint const bp_idx_mask,
- __global skc_paths_copy_elem const * const pb_elems, // from
- skc_uint const pb_elems_idx,
- skc_uint const pb_rolling)
-{
- //
- // if there are more path header words than there are
- // threads-per-block then we can just copy the initial header words
- //
-#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 )
- for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
- {
- skc_paths_copy_elem const elem = (pb_elems+pb_elems_idx)[ii];
-
- (bp_elems+bp_elems_idx)[ii] = elem;
-
- if (ii == 0) {
- skc_host_map_update(host_map,block,elem);
- }
- }
-#endif
-
- //
- // this is similar to copy node but the first H words of the path
- // header are not modified and simply copied
- //
- for (skc_uint ii=SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii<SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
- {
- skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];
-
-#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER == 0 )
- if (ii == 0) {
- skc_host_map_update(host_map,block,elem);
- }
-#endif
- // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
- skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
-
- //
- // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
- // will _always_ be safe as long as we don't use the loaded
- // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead
- // of iterating over the vector components.
- //
-
- // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
-
- // FIXME -- MIX MIX MIX MIX / SELECT
-
- // only convert if original elem is not invalid
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (SKC_IS_NOT_PATH_HEAD(ii,I) && (elem C != SKC_TAGGED_BLOCK_ID_INVALID)) { \
- skc_block_id_t const b = bp_ids[bp_idx C]; \
- elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \
- }
-
- // printf("%2u: ( %8X, %8X, %8X )\n",ii,bp_idx,b,elem C);
-
- SKC_PATHS_COPY_ELEM_EXPAND();
-
- // store the elem back
- (bp_elems+bp_elems_idx)[ii] = elem;
- }
-
- //
- // the remaining words are treated like a node
- //
- for (skc_uint ii=SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
- {
- // load block_id_tag words
- skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];
-
- // calculate ahead of time
- skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
-
- //
- // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
- // will _always_ be safe as long as we don't use the loaded
- // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead
- // of iterating over the vector components.
- //
-
- // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
-
- // only convert if original elem is not invalid
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \
- skc_block_id_t const b = bp_ids[bp_idx C]; \
- elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \
- }
-
- // printf("%2u: [ %8X, %8X, %8X ]\n",ii,bp_idx,b,elem C);
-
- SKC_PATHS_COPY_ELEM_EXPAND();
-
- // store the elem
- (bp_elems+bp_elems_idx)[ii] = elem;
- }
-}
-
-//
-// FIXME -- pack some of these constant integer args in a vec or struct
-//
-
-__kernel
-SKC_PATHS_COPY_KERNEL_ATTRIBS
-void
-skc_kernel_paths_copy
-(__global skc_uint * const host_map,
-
- __global skc_block_id_t const * const bp_ids,
- __global skc_paths_copy_elem * const bp_elems,
- skc_uint const bp_idx_mask, // pow2 modulo mask for block pool ring
-
- __global skc_uint const * const bp_alloc, // block pool ring base
- skc_uint const bp_alloc_idx,// which subbuf
-
- __global union skc_tagged_block_id const * const pb_cmds,
- __global skc_paths_copy_elem const * const pb_elems,
-
- skc_uint const pb_size, // # of commands/blocks in buffer
- skc_uint const pb_rolling, // shifted rolling counter base
-
- skc_uint const pb_prev_from,
- skc_uint const pb_prev_span,
- skc_uint const pb_curr_from)
-{
- //
- // THERE ARE 3 TYPES OF PATH COPYING COMMANDS:
- //
- // - HEAD
- // - NODE
- // - SEGS
- //
- // THESE ARE SUBGROUP ORIENTED KERNELS
- //
- // A SUBGROUP CAN OPERATE ON [1,N] BLOCKS
- //
-
- //
- // It's likely that peak bandwidth is achievable with a single
- // workgroup.
- //
- // So let's keep the grids modestly sized and for simplicity and
- // portability, let's assume that a single workgroup can perform all
- // steps in the copy.
- //
- // Launch as large of a workgroup as possiblex
- //
- // 1. ATOMICALLY ALLOCATE BLOCKS BP_ELEMS POOL
- // 2. CONVERT COMMANDS IN PB_ELEMS BLOCK OFFSETS
- // 3. FOR EACH COMMAND:
- // - HEAD: SAVED HEAD ID PB_ELEMS MAP. CONVERT AND COPY H INDICES.
- // - NODE: CONVERT AND COPY B INDICES
- // - SEGS: BULK COPY
- //
- // B : number of words in block -- always pow2
- // W : intelligently/arbitrarily chosen factor of B -- always pow2
- //
-
- //
- // There are several approaches to processing the commands:
- //
- // 1. B threads are responsible for one block. All threads broadcast
- // load a single command word. Workgroup size must be a facpb_elemsr of
- // B.
- //
- // 2. W threads process an entire block. W will typically be the
- // device's subgroup/warp/wave width. W threads broadcast load a
- // single command word.
- //
- // 3. W threads process W blocks. W threads load W command words and
- // process W blocks.
- //
- // Clearly (1) has low I/O intensity but will achieve high
- // parallelism by activating the most possible threads. The downside
- // of this kind of approach is that the kernel will occupy even a
- // large GPU with low intensity work and reduce opportunities for
- // concurrent kernel execution (of other kernels).
- //
- // See Vasily Volkov's CUDA presentation describing these tradeoffs.
- //
- // Note that there are many other approaches. For example, similar
- // pb_elems (1) but each thread loads a pow2 vector of block data.
- //
-
- // load the copied atomic read "base" from gmem
- skc_uint const bp_reads = bp_alloc[bp_alloc_idx];
- // will always be less than 2^32
- skc_uint const gid = get_global_id(0);
- // every subgroup/simd that will work on the block loads the same command
- skc_uint const sg_idx = gid / SKC_PATHS_COPY_SUBGROUP_SIZE;
- // path builder data can be spread across two spans
- skc_uint pb_idx = sg_idx + ((sg_idx < pb_prev_span) ? pb_prev_from : pb_curr_from);
-
- // no need pb_elems make this branchless
- if (pb_idx >= pb_size)
- pb_idx -= pb_size;
-
- // broadcast load the command
- union skc_tagged_block_id const pb_cmd = pb_cmds[pb_idx];
-
- // what do we want pb_elems do with this block?
- skc_cmd_paths_copy_tag const tag = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32);
-
- // compute offset from rolling base to get index into block pool ring allocation
- skc_uint const bp_off = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling);
-
- // convert the pb_cmd's offset counter pb_elems a block id
- skc_block_id_t const block = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off);
-
-#if 0
- if (get_sub_group_local_id() == 0) {
- printf("bp_off/reads = %u / %u\n",bp_off,bp_reads);
- printf("< %8u >\n",block);
- }
-#endif
-
- // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id()
- skc_uint const tid = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK;
-
- // calculate bp_elems (to) / pb_elems (from)
- skc_uint const bp_elems_idx = block * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid;
- skc_uint const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK + tid;
-
- if (tag == SKC_CMD_PATHS_COPY_TAG_SEGS)
- {
-#if 0
- if (tid == 0)
- printf("%3u, segs\n",bp_off);
-#endif
- skc_copy_segs(bp_elems,
- bp_elems_idx,
- pb_elems,
- pb_elems_idx);
- }
- else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE)
- {
-#if 0
- if (tid == 0)
- printf("%3u, NODE\n",bp_off);
-#endif
- skc_copy_node(bp_elems, // to
- bp_elems_idx,
- bp_ids,
- bp_reads,
- bp_idx_mask,
- pb_elems, // from
- pb_elems_idx,
- pb_rolling);
- }
- else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD)
- {
-#if 0
- if (tid == 0)
- printf("%3u, HEAD\n",bp_off);
-#endif
- skc_copy_head(host_map,
- block,
- bp_elems, // to
- bp_elems_idx,
- bp_ids,
- bp_reads,
- bp_idx_mask,
- pb_elems, // from
- pb_elems_idx,
- pb_rolling);
- }
-}
-
-//
-//
-//
-
-__kernel
-SKC_PATHS_ALLOC_KERNEL_ATTRIBS
-void
-skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics,
- __global skc_uint * const bp_alloc,
- skc_uint const bp_alloc_idx,
- skc_uint const pb_cmd_count)
-{
- //
- // allocate blocks in block pool
- //
- skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count);
-
- // store in slot
- bp_alloc[bp_alloc_idx] = reads;
-
-#if 0
- printf("pc: %8u + %u\n",reads,pb_cmd_count);
-#endif
-}
-
-//
-//
-//
+/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "path.h" +#include "block_pool_cl.h" +#include "path_builder_cl_12.h" +#include "kernel_cl_12.h" + +// +// +// + +#if 0 + +// +// SIMD AVX2 +// + +#define SKC_PATHS_COPY_WORDS_PER_ELEM 8 +#define SKC_PATHS_COPY_SUBGROUP_SIZE 1 +#define SKC_PATHS_COPY_KERNEL_ATTRIBUTES + +typedef skc_uint8 skc_paths_copy_elem; +typedef skc_uint8 skc_pb_idx_v; + +#define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_8() + +#define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + I >= SKC_PATH_HEAD_WORDS) + +#endif + +// +// +// + +#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK (SKC_PATHS_COPY_SUBGROUP_SIZE - 1) +#define SKC_PATHS_COPY_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS) +#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS) +#define SKC_PATHS_COPY_ELEMS_PER_THREAD (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE) + +// FIXME -- use SUBGROUP terminology everywhere +#define SKC_PATHS_COPY_SUBGROUP_WORDS (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS) + +// +// +// + +#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER \ + (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS)) + +#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER \ + (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS)) + +// #define SKC_PATHS_COPY_HEAD_ELEMS ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS) + +// +// +// + +// +// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL +// + +#define SKC_CMD_PATHS_COPY_ONE_BITS (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2) + +#define SKC_CMD_PATHS_COPY_ONE_MASK SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS) + +#define SKC_CMD_PATHS_COPY_ONE (1u << SKC_CMD_PATHS_COPY_ONE_BITS) + +#define SKC_CMD_PATHS_COPY_GET_TAG(ti) SKC_TAGGED_BLOCK_ID_GET_TAG(ti) + +#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti) ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS) + +#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b) (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG)) + +// +// +// + +skc_uint +skc_sub_group_local_id() +{ +#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1 + return get_sub_group_local_id(); +#else + return 0; +#endif +} + +// +// convert an atomic read counter offset to a block id +// + +skc_block_id_t +skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids, + skc_uint const bp_idx_mask, + skc_uint const bp_reads, + skc_uint const bp_off) +{ + skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask; + + return bp_ids[bp_idx]; +} + +// +// +// + +void +skc_copy_segs(__global skc_paths_copy_elem * const bp_elems, // to + skc_uint const bp_elems_idx, + __global skc_paths_copy_elem const * const pb_elems, // from + skc_uint const pb_elems_idx) +{ + for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) + { + (bp_elems+bp_elems_idx)[ii] = (pb_elems+pb_elems_idx)[ii]; + } + +#if 0 + // + // NOTE THIS IS PRINTING 8 ROWS + // + printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n", + (skc_uint)get_global_id(0),pb_elems_idx, + as_float((pb_elems+pb_elems_idx)[0*SKC_PATHS_COPY_SUBGROUP_SIZE]), + as_float((pb_elems+pb_elems_idx)[1*SKC_PATHS_COPY_SUBGROUP_SIZE]), + as_float((pb_elems+pb_elems_idx)[2*SKC_PATHS_COPY_SUBGROUP_SIZE]), + 
as_float((pb_elems+pb_elems_idx)[3*SKC_PATHS_COPY_SUBGROUP_SIZE])); + printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n", + (skc_uint)get_global_id(0),pb_elems_idx, + as_float((pb_elems+pb_elems_idx)[4*SKC_PATHS_COPY_SUBGROUP_SIZE]), + as_float((pb_elems+pb_elems_idx)[5*SKC_PATHS_COPY_SUBGROUP_SIZE]), + as_float((pb_elems+pb_elems_idx)[6*SKC_PATHS_COPY_SUBGROUP_SIZE]), + as_float((pb_elems+pb_elems_idx)[7*SKC_PATHS_COPY_SUBGROUP_SIZE])); +#endif +} + +// +// +// + +void +skc_copy_node(__global skc_paths_copy_elem * const bp_elems, // to + skc_uint const bp_elems_idx, + __global skc_block_id_t const * const bp_ids, + skc_uint const bp_reads, + skc_uint const bp_idx_mask, + __global skc_paths_copy_elem const * const pb_elems, // from + skc_uint const pb_elems_idx, + skc_uint const pb_rolling) +{ + // + // remap block id tags bp_elems the host-side rolling counter pb_elems a + // device-side block pool id + // + for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) + { + // load block_id_tag words + skc_paths_copy_elem elem = (pb_elems + pb_elems_idx)[ii]; + + // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid + skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask; + + // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS + + // + // FIXME -- SIMD can be fully parallelized since a bp_ids[] load + // will _always_ be safe as long as we don't use the loaded + // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead + // of iterating over the vector components. + // + + // only convert if original elem is not invalid + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \ + skc_block_id_t const b = bp_ids[bp_idx C]; \ + elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \ + } + + // printf("%2u: < %8X, %8X, %8X >\n",ii,bp_idx,b,elem C); + + SKC_PATHS_COPY_ELEM_EXPAND(); + + // store the elem back + (bp_elems+bp_elems_idx)[ii] = elem; + } +} + +// +// +// + +void +skc_host_map_update(__global skc_uint * const host_map, + skc_uint const block, + skc_paths_copy_elem const elem) +{ + // + // write first elem to map -- FIXME -- this is a little nasty + // because it relies on the the host handle always being the first + // word in the path header. + // + // OTOH, this is not unreasonable. The alternative is to have a + // separate kernel initializing the map. 
+ // +#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1 + if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE) +#endif + { +#if SKC_PATHS_COPY_ELEM_WORDS == 1 + host_map[elem] = block; +#if 0 + printf("[%u] = %u\n",elem,block); +#endif +#else + host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block; +#endif + } +} + +// +// +// + +void +skc_copy_head(__global skc_uint * const host_map, + skc_uint const block, + __global skc_paths_copy_elem * const bp_elems, // to + skc_uint const bp_elems_idx, + __global skc_block_id_t const * const bp_ids, + skc_uint const bp_reads, + skc_uint const bp_idx_mask, + __global skc_paths_copy_elem const * const pb_elems, // from + skc_uint const pb_elems_idx, + skc_uint const pb_rolling) +{ + // + // if there are more path header words than there are + // threads-per-block then we can just copy the initial header words + // +#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 ) + for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) + { + skc_paths_copy_elem const elem = (pb_elems+pb_elems_idx)[ii]; + + (bp_elems+bp_elems_idx)[ii] = elem; + + if (ii == 0) { + skc_host_map_update(host_map,block,elem); + } + } +#endif + + // + // this is similar to copy node but the first H words of the path + // header are not modified and simply copied + // + for (skc_uint ii=SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii<SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) + { + skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii]; + +#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER == 0 ) + if (ii == 0) { + skc_host_map_update(host_map,block,elem); + } +#endif + // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid + skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask; + + // + // FIXME -- SIMD can be fully parallelized since a bp_ids[] load + // will _always_ be safe as long as we don't use the loaded + // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead + // of iterating over the vector components. + // + + // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS + + // FIXME -- MIX MIX MIX MIX / SELECT + + // only convert if original elem is not invalid +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (SKC_IS_NOT_PATH_HEAD(ii,I) && (elem C != SKC_TAGGED_BLOCK_ID_INVALID)) { \ + skc_block_id_t const b = bp_ids[bp_idx C]; \ + elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \ + } + + // printf("%2u: ( %8X, %8X, %8X )\n",ii,bp_idx,b,elem C); + + SKC_PATHS_COPY_ELEM_EXPAND(); + + // store the elem back + (bp_elems+bp_elems_idx)[ii] = elem; + } + + // + // the remaining words are treated like a node + // + for (skc_uint ii=SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) + { + // load block_id_tag words + skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii]; + + // calculate ahead of time + skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask; + + // + // FIXME -- SIMD can be fully parallelized since a bp_ids[] load + // will _always_ be safe as long as we don't use the loaded + // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead + // of iterating over the vector components. 
+ // + + // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS + + // only convert if original elem is not invalid +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \ + skc_block_id_t const b = bp_ids[bp_idx C]; \ + elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \ + } + + // printf("%2u: [ %8X, %8X, %8X ]\n",ii,bp_idx,b,elem C); + + SKC_PATHS_COPY_ELEM_EXPAND(); + + // store the elem + (bp_elems+bp_elems_idx)[ii] = elem; + } +} + +// +// FIXME -- pack some of these constant integer args in a vec or struct +// + +__kernel +SKC_PATHS_COPY_KERNEL_ATTRIBS +void +skc_kernel_paths_copy +(__global skc_uint * const host_map, + + __global skc_block_id_t const * const bp_ids, + __global skc_paths_copy_elem * const bp_elems, + skc_uint const bp_idx_mask, // pow2 modulo mask for block pool ring + + __global skc_uint const * const bp_alloc, // block pool ring base + skc_uint const bp_alloc_idx,// which subbuf + + __global union skc_tagged_block_id const * const pb_cmds, + __global skc_paths_copy_elem const * const pb_elems, + + skc_uint const pb_size, // # of commands/blocks in buffer + skc_uint const pb_rolling, // shifted rolling counter base + + skc_uint const pb_prev_from, + skc_uint const pb_prev_span, + skc_uint const pb_curr_from) +{ + // + // THERE ARE 3 TYPES OF PATH COPYING COMMANDS: + // + // - HEAD + // - NODE + // - SEGS + // + // THESE ARE SUBGROUP ORIENTED KERNELS + // + // A SUBGROUP CAN OPERATE ON [1,N] BLOCKS + // + + // + // It's likely that peak bandwidth is achievable with a single + // workgroup. + // + // So let's keep the grids modestly sized and for simplicity and + // portability, let's assume that a single workgroup can perform all + // steps in the copy. + // + // Launch as large of a workgroup as possiblex + // + // 1. ATOMICALLY ALLOCATE BLOCKS BP_ELEMS POOL + // 2. CONVERT COMMANDS IN PB_ELEMS BLOCK OFFSETS + // 3. FOR EACH COMMAND: + // - HEAD: SAVED HEAD ID PB_ELEMS MAP. CONVERT AND COPY H INDICES. + // - NODE: CONVERT AND COPY B INDICES + // - SEGS: BULK COPY + // + // B : number of words in block -- always pow2 + // W : intelligently/arbitrarily chosen factor of B -- always pow2 + // + + // + // There are several approaches to processing the commands: + // + // 1. B threads are responsible for one block. All threads broadcast + // load a single command word. Workgroup size must be a facpb_elemsr of + // B. + // + // 2. W threads process an entire block. W will typically be the + // device's subgroup/warp/wave width. W threads broadcast load a + // single command word. + // + // 3. W threads process W blocks. W threads load W command words and + // process W blocks. + // + // Clearly (1) has low I/O intensity but will achieve high + // parallelism by activating the most possible threads. The downside + // of this kind of approach is that the kernel will occupy even a + // large GPU with low intensity work and reduce opportunities for + // concurrent kernel execution (of other kernels). + // + // See Vasily Volkov's CUDA presentation describing these tradeoffs. + // + // Note that there are many other approaches. For example, similar + // pb_elems (1) but each thread loads a pow2 vector of block data. 
+ // + + // load the copied atomic read "base" from gmem + skc_uint const bp_reads = bp_alloc[bp_alloc_idx]; + // will always be less than 2^32 + skc_uint const gid = get_global_id(0); + // every subgroup/simd that will work on the block loads the same command + skc_uint const sg_idx = gid / SKC_PATHS_COPY_SUBGROUP_SIZE; + // path builder data can be spread across two spans + skc_uint pb_idx = sg_idx + ((sg_idx < pb_prev_span) ? pb_prev_from : pb_curr_from); + + // no need pb_elems make this branchless + if (pb_idx >= pb_size) + pb_idx -= pb_size; + + // broadcast load the command + union skc_tagged_block_id const pb_cmd = pb_cmds[pb_idx]; + + // what do we want pb_elems do with this block? + skc_cmd_paths_copy_tag const tag = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32); + + // compute offset from rolling base to get index into block pool ring allocation + skc_uint const bp_off = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling); + + // convert the pb_cmd's offset counter pb_elems a block id + skc_block_id_t const block = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off); + +#if 0 + if (get_sub_group_local_id() == 0) { + printf("bp_off/reads = %u / %u\n",bp_off,bp_reads); + printf("< %8u >\n",block); + } +#endif + + // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id() + skc_uint const tid = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK; + + // calculate bp_elems (to) / pb_elems (from) + skc_uint const bp_elems_idx = block * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid; + skc_uint const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK + tid; + + if (tag == SKC_CMD_PATHS_COPY_TAG_SEGS) + { +#if 0 + if (tid == 0) + printf("%3u, segs\n",bp_off); +#endif + skc_copy_segs(bp_elems, + bp_elems_idx, + pb_elems, + pb_elems_idx); + } + else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE) + { +#if 0 + if (tid == 0) + printf("%3u, NODE\n",bp_off); +#endif + skc_copy_node(bp_elems, // to + bp_elems_idx, + bp_ids, + bp_reads, + bp_idx_mask, + pb_elems, // from + pb_elems_idx, + pb_rolling); + } + else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD) + { +#if 0 + if (tid == 0) + printf("%3u, HEAD\n",bp_off); +#endif + skc_copy_head(host_map, + block, + bp_elems, // to + bp_elems_idx, + bp_ids, + bp_reads, + bp_idx_mask, + pb_elems, // from + pb_elems_idx, + pb_rolling); + } +} + +// +// +// + +__kernel +SKC_PATHS_ALLOC_KERNEL_ATTRIBS +void +skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics, + __global skc_uint * const bp_alloc, + skc_uint const bp_alloc_idx, + skc_uint const pb_cmd_count) +{ + // + // allocate blocks in block pool + // + skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count); + + // store in slot + bp_alloc[bp_alloc_idx] = reads; + +#if 0 + printf("pc: %8u + %u\n",reads,pb_cmd_count); +#endif +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl b/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl index 2aee5dac17..5441dcdec7 100644 --- a/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl +++ b/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl @@ -1,390 +1,390 @@ -/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-// FIXME -- a pre-allocation step could load the path header quads and
-// total up the number of blocks in the workgroup or subgroup
-// minimizing the number of later atomic adds.
-//
-
-#include "block.h"
-#include "path.h"
-#include "common.h"
-#include "atomic_cl.h"
-#include "block_pool_cl.h"
-#include "device_cl_12.h"
-
-//
-//
-//
-
-#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
-
-#define SKC_PATHS_RECLAIM_SUBGROUP_ELEMS (SKC_PATHS_RECLAIM_SUBGROUP_SIZE * SKC_PATHS_RECLAIM_LOCAL_ELEMS)
-
-#define SKC_PATHS_RECLAIM_X (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_RECLAIM_SUBGROUP_ELEMS)
-
-//
-//
-//
-
-#if ( SKC_PATHS_RECLAIM_X == 1 )
-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1()
-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 0
-
-#elif ( SKC_PATHS_RECLAIM_X == 2 )
-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2()
-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 1
-
-#elif ( SKC_PATHS_RECLAIM_X == 4 )
-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4()
-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 3
-
-#elif ( SKC_PATHS_RECLAIM_X == 8 )
-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8()
-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 7
-
-#elif ( SKC_PATHS_RECLAIM_X == 16)
-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16()
-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 15
-
-#else
-#error "MISSING SKC_PATHS_RECLAIM_X"
-#endif
-
-//
-// FIXME -- slate these for replacement
-//
-
-#define SKC_BROADCAST(E,S,I) \
- sub_group_broadcast(E,S - I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
-
-#define SKC_BROADCAST_LAST_HELPER(E,I) \
- sub_group_broadcast(E,SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
-
-#define SKC_BROADCAST_LAST(E,I) \
- SKC_BROADCAST_LAST_HELPER(E,I)
-
-//
-// COMPILE-TIME PREDICATES
-//
-
-#define SKC_PATHS_RECLAIM_ELEM_GTE(X,I) \
- SKC_GTE_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
-
-#define SKC_PATHS_RECLAIM_ELEM_IN_RANGE(X,I) \
- (skc_bool)SKC_GTE_MACRO(X, I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) && \
- (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
-
-#define SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I) \
- SKC_PATHS_RECLAIM_ELEM_GTE(SKC_PATH_HEAD_WORDS,I)
-
-#define SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I) \
- SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_WORDS,I)
-
-//
-// RUN-TIME PREDICATES
-//
-
-#define SKC_PATHS_RECLAIM_IS_HEADER(I) \
- (get_sub_group_local_id() + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE < SKC_PATH_HEAD_WORDS)
-
-//
-// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
-// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
-// COMBOS (NOT NECESSARILY POW2)
-//
-// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
-// UINT TYPE INSTEAD OF A ULONG.
-//
-
-#define SKC_PATHS_RECLAIM_PACKED_COUNT_BITS SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2
-#define SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE skc_uint
-
-//
-//
-//
-
-#define SKC_PATHS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_PATHS_RECLAIM_PACKED_COUNT_BITS)
-
-#define SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \
- (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \
- ? 0 : (1u << SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I))
-
-#define SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \
- S = sub_group_scan_exclusive_add(C)
-
-#define SKC_PATHS_RECLAIM_PACKED_COUNT_GET(C,I) \
- (((C) >> (SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_PATHS_RECLAIM_PACKED_COUNT_MASK)
-
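
Read sequentially, the packed-count macros above implement a "many scans for the price of one" trick: each lane packs its per-row block count into a narrow bitfield, a single subgroup exclusive add then scans every row's field at once, and GET extracts the per-row exclusive index. The small host-side C simulation below illustrates the idea; the 8-lane width, the 2 rows and all names are assumptions for illustration, not SKC code.

#include <stdio.h>
#include <stdint.h>

#define LANES      8u                      /* assumed subgroup size             */
#define COUNT_BITS 3u                      /* log2(LANES) bits per packed field */
#define COUNT_MASK ((1u << COUNT_BITS) - 1u)
#define ROWS       2u                      /* register rows per lane            */

int main(void)
{
  /* is_block[row][lane] == 1 when that element releases a whole block */
  uint32_t const is_block[ROWS][LANES] = {
    { 1, 0, 1, 1, 0, 0, 1, 0 },
    { 0, 1, 0, 0, 1, 1, 0, 1 },
  };

  /* each lane packs its per-row counts into one word, one field per row */
  uint32_t packed[LANES];
  for (uint32_t lane = 0; lane < LANES; lane++) {
    packed[lane] = 0;
    for (uint32_t row = 0; row < ROWS; row++)
      packed[lane] |= is_block[row][lane] << (COUNT_BITS * row);
  }

  /* one exclusive scan across lanes sums all row fields independently
     (valid because no field can overflow its COUNT_BITS) */
  uint32_t running = 0;
  for (uint32_t lane = 0; lane < LANES; lane++) {
    uint32_t const excl = running;         /* exclusive prefix for this lane */
    running += packed[lane];
    for (uint32_t row = 0; row < ROWS; row++) {
      uint32_t const index = (excl >> (COUNT_BITS * row)) & COUNT_MASK;
      if (is_block[row][lane])
        printf("row %u lane %u -> index %u within its row\n", row, lane, index);
    }
  }
  return 0;
}

In the kernel the per-row index is added to a ring base that is bumped by each row's broadcast total, so the rows chain into one contiguous allocation.
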
-//
-//
-//
-
-struct skc_reclaim
-{
- skc_path_h aN[SKC_RECLAIM_ARRAY_SIZE];
-};
-
-__kernel
-SKC_PATHS_RECLAIM_KERNEL_ATTRIBS
-void
-skc_kernel_paths_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring
- __global skc_uint * const bp_elems, // block pool blocks
- __global skc_uint volatile * const bp_atomics, // read/write atomics
- skc_uint const bp_mask, // pow2 modulo mask for block pool ring
- __global skc_block_id_t const * const map, // path host-to-device map
- struct skc_reclaim const reclaim) // array of host path ids
-{
-#if (__OPENCL_VERSION__ < 200)
- skc_uint const reclaim_stride = get_num_sub_groups();
-#else
- skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
-#endif
- skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id();
-
-#if 0
- //
- // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
- // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
- // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
- // RECLAMATION JOB ON THE REST OF THE PIPELINE.
- //
- for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
-#endif
- {
- // get host path id
- skc_path_h const path = reclaim.aN[reclaim_idx];
-
- // get the path header block from the map
- skc_block_id_t id = map[path];
-
- //
- // blindly load all of the head elements into registers
- //
- skc_uint const head_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- skc_uint h##I = bp_elems[head_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];
-
- SKC_PATHS_RECLAIM_BLOCK_EXPAND();
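
The SKC_EXPAND_X / BLOCK_EXPAND pair above is a classic X-macro: BLOCK_EXPAND expands to one SKC_EXPAND_X(I,...) invocation per register row, and redefining SKC_EXPAND_X immediately before each expansion changes what gets emitted for every row. A tiny, stand-alone C demo of the same idiom follows; the names and sizes are invented for illustration.

#include <stdio.h>

/* one invocation of DEMO_X per register row -- mirrors SKC_..._BLOCK_EXPAND() */
#define DEMO_EXPAND_2() DEMO_X(0) DEMO_X(1)

int main(void)
{
  enum { LANES = 4 };                       /* pretend subgroup size             */
  int buf[2 * LANES];
  for (int i = 0; i < 2 * LANES; i++)
    buf[i] = 10 * i;

  int const lane = 2;                       /* pretend get_sub_group_local_id()  */

  /* "blindly load all rows into registers": this declares h0 and h1 */
#define DEMO_X(I) int h##I = buf[lane + (I) * LANES];
  DEMO_EXPAND_2()
#undef DEMO_X

  printf("h0 = %d, h1 = %d\n", h0, h1);     /* prints 20 and 60 */
  return 0;
}
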
-
- //
- // pick out count.blocks and count.nodes from the header
- //
- skc_uint count_blocks, count_nodes;
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \
- count_blocks = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \
- } \
- if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \
- count_nodes = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_NODES,I); \
- }
-
- SKC_PATHS_RECLAIM_BLOCK_EXPAND();
-
-#if 0
- if (get_sub_group_local_id() == 0) {
- printf("reclaim paths: %9u / %5u / %5u\n",path,count_blocks,count_nodes);
- }
-#endif
-
- //
- // acquire a span in the block pool ids ring for reclaimed ids
- //
- // FIXME count_blocks and atomic add can be done in same lane
- //
- skc_uint bp_ids_base = 0;
-
- if (get_sub_group_local_id() == 0) {
- bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
-
-#if 0
- printf("paths: bp_ids_base = %u\n",bp_ids_base);
-#endif
- }
-
- bp_ids_base = sub_group_broadcast(bp_ids_base,0);
-
- //
- // shift away the tagged block id's tag
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
- h##I = h##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; \
- }
-
- SKC_PATHS_RECLAIM_BLOCK_EXPAND();
-
- //
- // swap current id with next
- //
- if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
- {
- skc_block_id_t const next = SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);
-
- SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
-
- id = next;
- }
-
- //
- // - we'll skip subgroups that are entirely header
- //
- // - but we need to mark any header elements that partially fill
- // a subgroup as invalid tagged block ids
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
- if (SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)) { \
- if (SKC_PATHS_RECLAIM_IS_HEADER(I)) { \
- h##I = SKC_TAGGED_BLOCK_ID_INVALID; \
- } \
- } \
- }
-
- SKC_PATHS_RECLAIM_BLOCK_EXPAND();
-
- {
- //
- // count reclaimable blocks in each lane
- //
- SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
- packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
- }
-
- SKC_PATHS_RECLAIM_BLOCK_EXPAND();
-
- //
- // scan to find index of each block
- //
- SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
-
- SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
-
- //
- // store blocks back to ring
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
- skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
- skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
- skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
- if (count > 0) { \
- bp_ids[bp_ids_idx] = h##I; \
- } \
- skc_uint const total = index + count; \
- bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
- }
-
- SKC_PATHS_RECLAIM_BLOCK_EXPAND();
-
- // printf("P %7u ! %u\n",bp_ids_idx,h##I);
- }
-
- //
- // we're done if it was just the header
- //
- if (count_nodes == 0)
- return;
-
- //
- // otherwise, walk the nodes
- //
- do {
- // id of next block is in last lane
- id = sub_group_broadcast(id,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1);
-
- // get index of each element
- skc_uint const node_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
-
- //
- // blindly load all of the node elements into registers
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- skc_uint n##I = bp_elems[node_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];
-
- SKC_PATHS_RECLAIM_BLOCK_EXPAND();
-
- //
- // shift away the tagged block id's tag
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- n##I = n##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG;
-
- SKC_PATHS_RECLAIM_BLOCK_EXPAND();
-
- //
- // swap current id with next
- //
- if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
- {
- skc_block_id_t const next = SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);
-
- SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
-
- id = next;
- }
-
- //
- // count reclaimable blocks in each lane
- //
- SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
-
- SKC_PATHS_RECLAIM_BLOCK_EXPAND();
-
- //
- // scan to find index of each block
- //
- SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
-
- SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
-
- //
- // store blocks back to ring
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) { \
- skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
- skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
- skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
- if (count > 0) { \
- bp_ids[bp_ids_idx] = n##I; \
- } \
- skc_uint const total = index + count; \
- bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
- }
-
- SKC_PATHS_RECLAIM_BLOCK_EXPAND();
-
- // printf("P %7u ! %u\n",bp_ids_idx,n##I);
-
- // any more nodes?
- } while (--count_nodes > 0);
- }
-}
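
In plain sequential terms, the kernel above reclaims a path by walking its head block and the chain of trailing node blocks, pushing every freed block id back onto the block pool ring; the last-lane swap at the end of each block is simply how the walk recycles the block it is standing on while carrying the next-node link forward. Below is a scalar C sketch of that walk over an assumed, much-simplified layout (no header words, tags reduced to a shift) -- an illustration only, not the SKC data structures.

#include <stdio.h>
#include <stdint.h>

#define NODE_WORDS 4u                       /* slots per block; last slot links onward */
#define TAG_BITS   2u
#define INVALID    UINT32_MAX

typedef struct { uint32_t tagged[NODE_WORDS]; } node_t;

static void reclaim_block(uint32_t id) { printf("reclaim block %u\n", id); }

static void reclaim_path(node_t const * pool, uint32_t head_id, uint32_t count_nodes)
{
  uint32_t id = head_id;
  do {
    node_t const * const n = &pool[id];

    /* every slot except the trailing link holds a tagged block id */
    for (uint32_t i = 0; i < NODE_WORDS - 1; i++)
      if (n->tagged[i] != INVALID)
        reclaim_block(n->tagged[i] >> TAG_BITS);

    /* reclaim the node block itself, then follow the link -- the
       sequential analogue of the kernel's last-lane id swap */
    uint32_t const next = n->tagged[NODE_WORDS - 1] >> TAG_BITS;
    reclaim_block(id);
    id = next;
  } while (count_nodes-- > 0);
}

int main(void)
{
  node_t const pool[2] = {
    { {  8 << TAG_BITS,  9 << TAG_BITS, INVALID, 1 << TAG_BITS } }, /* head -> node 1 */
    { { 12 << TAG_BITS, 13 << TAG_BITS, 14 << TAG_BITS, 0 } },      /* last node      */
  };
  reclaim_path(pool, 0, 1);   /* head plus one trailing node */
  return 0;
}
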
-
-//
-//
-//
+/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// FIXME -- a pre-allocation step could load the path header quads and +// total up the number of blocks in the workgroup or subgroup +// minimizing the number of later atomics adds. +// + +#include "block.h" +#include "path.h" +#include "common.h" +#include "atomic_cl.h" +#include "block_pool_cl.h" +#include "kernel_cl_12.h" + +// +// +// + +#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) + +#define SKC_PATHS_RECLAIM_SUBGROUP_ELEMS (SKC_PATHS_RECLAIM_SUBGROUP_SIZE * SKC_PATHS_RECLAIM_LOCAL_ELEMS) + +#define SKC_PATHS_RECLAIM_X (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_RECLAIM_SUBGROUP_ELEMS) + +// +// +// + +#if ( SKC_PATHS_RECLAIM_X == 1 ) +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1() +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 0 + +#elif ( SKC_PATHS_RECLAIM_X == 2 ) +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2() +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 1 + +#elif ( SKC_PATHS_RECLAIM_X == 4 ) +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4() +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 3 + +#elif ( SKC_PATHS_RECLAIM_X == 8 ) +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8() +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 7 + +#elif ( SKC_PATHS_RECLAIM_X == 16) +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16() +#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 15 + +#else +#error "MISSING SKC_PATHS_RECLAIM_X" +#endif + +// +// FIXME -- slate these for replacement +// + +#define SKC_BROADCAST(E,S,I) \ + sub_group_broadcast(E,S - I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) + +#define SKC_BROADCAST_LAST_HELPER(E,I) \ + sub_group_broadcast(E,SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) + +#define SKC_BROADCAST_LAST(E,I) \ + SKC_BROADCAST_LAST_HELPER(E,I) + +// +// COMPILE-TIME PREDICATES +// + +#define SKC_PATHS_RECLAIM_ELEM_GTE(X,I) \ + SKC_GTE_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) + +#define SKC_PATHS_RECLAIM_ELEM_IN_RANGE(X,I) \ + (skc_bool)SKC_GTE_MACRO(X, I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) && \ + (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) + +#define SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I) \ + SKC_PATHS_RECLAIM_ELEM_GTE(SKC_PATH_HEAD_WORDS,I) + +#define SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I) \ + SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_WORDS,I) + +// +// RUN-TIME PREDICATES +// + +#define SKC_PATHS_RECLAIM_IS_HEADER(I) \ + (get_sub_group_local_id() + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE < SKC_PATH_HEAD_WORDS) + +// +// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL +// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK +// COMBOS (NOT NECESSARILY POW2) +// +// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR +// UINT TYPE INSTEAD OF A ULONG. +// + +#define SKC_PATHS_RECLAIM_PACKED_COUNT_BITS SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2 +#define SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE skc_uint + +// +// +// + +#define SKC_PATHS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_PATHS_RECLAIM_PACKED_COUNT_BITS) + +#define SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \ + (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \ + ? 
0 : (1u << SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) + +#define SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \ + S = sub_group_scan_exclusive_add(C) + +#define SKC_PATHS_RECLAIM_PACKED_COUNT_GET(C,I) \ + (((C) >> (SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_PATHS_RECLAIM_PACKED_COUNT_MASK) + +// +// +// + +struct skc_reclaim +{ + skc_path_h aN[SKC_RECLAIM_ARRAY_SIZE]; +}; + +__kernel +SKC_PATHS_RECLAIM_KERNEL_ATTRIBS +void +skc_kernel_paths_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring + __global skc_uint * const bp_elems, // block pool blocks + __global skc_uint volatile * const bp_atomics, // read/write atomics + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const map, // path host-to-device map + struct skc_reclaim const reclaim) // array of host path ids +{ +#if (__OPENCL_VERSION__ < 200) + skc_uint const reclaim_stride = get_num_sub_groups(); +#else + skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups +#endif + skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id(); + +#if 0 + // + // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT + // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL + // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE + // RECLAMATION JOB ON THE REST OF THE PIPELINE. + // + for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride) +#endif + { + // get host path id + skc_path_h const path = reclaim.aN[reclaim_idx]; + + // get the path header block from the map + skc_block_id_t id = map[path]; + + // + // blindly load all of the head elements into registers + // + skc_uint const head_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + skc_uint h##I = bp_elems[head_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE]; + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // + // pick out count.nodes and count.prims from the header + // + skc_uint count_blocks, count_nodes; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \ + count_blocks = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \ + } \ + if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \ + count_nodes = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_NODES,I); \ + } + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + +#if 0 + if (get_sub_group_local_id() == 0) { + printf("reclaim paths: %9u / %5u / %5u\n",path,count_blocks,count_nodes); + } +#endif + + // + // acquire a span in the block pool ids ring for reclaimed ids + // + // FIXME count_blocks and atomic add can be done in same lane + // + skc_uint bp_ids_base = 0; + + if (get_sub_group_local_id() == 0) { + bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks); + +#if 0 + printf("paths: bp_ids_base = %u\n",bp_ids_base); +#endif + } + + bp_ids_base = sub_group_broadcast(bp_ids_base,0); + + // + // shift away the tagged block id's tag + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ + h##I = h##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; \ + } + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // + // swap current id with next + // + if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) + { + skc_block_id_t const next = SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST); + + 
SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; + + id = next; + } + + // + // - we'll skip subgroups that are entirely header + // + // - but we need to mark any header elements that partially fill + // a subgroup as invalid tagged block ids + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ + if (SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)) { \ + if (SKC_PATHS_RECLAIM_IS_HEADER(I)) { \ + h##I = SKC_TAGGED_BLOCK_ID_INVALID; \ + } \ + } \ + } + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + { + // + // count reclaimable blocks in each lane + // + SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ + packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \ + } + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // + // scan to find index of each block + // + SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); + + SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); + + // + // store blocks back to ring + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ + skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ + skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ + skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ + if (count > 0) { \ + bp_ids[bp_ids_idx] = h##I; \ + } \ + skc_uint const total = index + count; \ + bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \ + } + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // printf("P %7u ! %u\n",bp_ids_idx,h##I); + } + + // + // we're done if it was just the header + // + if (count_nodes == 0) + return; + + // + // otherwise, walk the nodes + // + do { + // id of next block is in last lane + id = sub_group_broadcast(id,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); + + // get index of each element + skc_uint const node_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); + + // + // blindly load all of the node elements into registers + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + skc_uint n##I = bp_elems[node_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE]; + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // + // shift away the tagged block id's tag + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + n##I = n##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // + // swap current id with next + // + if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) + { + skc_block_id_t const next = SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST); + + SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; + + id = next; + } + + // + // count reclaimable blocks in each lane + // + SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I); + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // + // scan to find index of each block + // + SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); + + SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); + + // + // store blocks back to ring + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) { \ + skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ + skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ + skc_uint 
const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ + if (count > 0) { \ + bp_ids[bp_ids_idx] = n##I; \ + } \ + skc_uint const total = index + count; \ + bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \ + } + + SKC_PATHS_RECLAIM_BLOCK_EXPAND(); + + // printf("P %7u ! %u\n",bp_ids_idx,n##I); + + // any more nodes? + } while (--count_nodes > 0); + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/place.cl b/src/compute/skc/platforms/cl_12/kernels/place.cl index 92fa0a243d..8866bdb3e6 100644 --- a/src/compute/skc/platforms/cl_12/kernels/place.cl +++ b/src/compute/skc/platforms/cl_12/kernels/place.cl @@ -1,871 +1,871 @@ -/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "tile.h"
-#include "common.h"
-#include "raster.h"
-#include "atomic_cl.h"
-#include "device_cl_12.h"
-
-//
-//
-//
-
-#define SKC_PLACE_SUBGROUP_MASK (SKC_PLACE_SUBGROUP_SIZE - 1)
-#define SKC_PLACE_SUBGROUP_LAST (SKC_PLACE_SUBGROUP_SIZE - 1)
-
-//
-//
-//
-
-#define SKC_PLACE_SMEM_COUNT_TTSK SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE)
-#define SKC_PLACE_SMEM_COUNT_TTPK SKC_RASTER_NODE_MAX_TTPK
-
-//
-//
-//
-
-#define SKC_PLACE_X (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE)
-
-//
-//
-//
-
-#if ( SKC_PLACE_X == 1 )
-#define SKC_PLACE_EXPAND() SKC_EXPAND_1()
-#define SKC_PLACE_EXPAND_I_LAST 0
-
-#elif ( SKC_PLACE_X == 2 )
-#define SKC_PLACE_EXPAND() SKC_EXPAND_2()
-#define SKC_PLACE_EXPAND_I_LAST 1
-
-#elif ( SKC_PLACE_X == 4 )
-#define SKC_PLACE_EXPAND() SKC_EXPAND_4()
-#define SKC_PLACE_EXPAND_I_LAST 3
-
-#elif ( SKC_PLACE_X == 8 )
-#define SKC_PLACE_EXPAND() SKC_EXPAND_8()
-#define SKC_PLACE_EXPAND_I_LAST 7
-
-#elif ( SKC_PLACE_X == 16)
-#define SKC_PLACE_EXPAND() SKC_EXPAND_16()
-#define SKC_PLACE_EXPAND_I_LAST 15
-#endif
-
-//
-// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE
-// COALESCED WRITES. LO FIRST, FOLLOWED BY HI.
-//
-// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE
-// KERNELS USE DIFFERENT SUBGROUP SIZES.
-//
-// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE
-// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID.
-//
-// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER
-// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). VULKAN MAY
-// ONLY SUPPORT A SUBGROUP SIZE OF 16.
-//
-
-#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE )
-
-#define SKC_PLACE_STRIDE_H(L) (L)
-#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
-#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE)
-
-#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1
-
-#define SKC_PLACE_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE)
-#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_RATIO - 1)
-#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I) ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK))
-
-#define SKC_PLACE_STRIDE_H(L) (L)
-#define SKC_PLACE_STRIDE_V_LO(I) (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE)
-#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE)
-
-#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1
-
-#define SKC_PLACE_SUBGROUP_RATIO (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
-#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
-
-#define SKC_PLACE_STRIDE_H(L) (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK))
-#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
-#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO)
-
-#endif
-
-//
-// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE
-// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8)
-//
-
-#define SKC_PLACE_IS_ALL_HEADER_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS)
-
-#define SKC_PLACE_IS_NOT_HEADER_ROW(i) ( (i) * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS)
-
-#define SKC_PLACE_IS_TRAILING_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS)
-
-#define SKC_PLACE_IS_HEADER_ROW_KEY(i) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
-
-
-//
-// Note: HEADER_LESS_THAN purposely lets the unsigned index wrap to a value near UINT_MAX
-//
-#define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
-#define SKC_PLACE_NODE_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() < (k))
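
The wrap mentioned in the note above is what lets one unsigned compare do double duty: for lanes still inside the raster header, the subtraction underflows to a value near UINT_MAX, which can never be < k, so header lanes are rejected without an extra test. A tiny C check under assumed sizes (8 header dwords, 8-lane subgroup, k = 5):

#include <stdio.h>

int main(void)
{
  unsigned const HEAD_DWORDS = 8, SUBGROUP = 8, k = 5;

  for (unsigned row = 0; row < 2; row++)
    for (unsigned lane = 0; lane < SUBGROUP; lane++) {
      /* same shape as the HEADER_LESS_THAN predicate */
      unsigned const idx = row * SUBGROUP + lane - HEAD_DWORDS;
      printf("row %u lane %u: idx = %10u -> %s\n",
             row, lane, idx, (idx < k) ? "key" : "not a key");
    }
  return 0;
}

Row 0 underflows for every lane and is rejected; row 1 yields indices 0..7, of which only those below k qualify.
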
-
-//
-// TTSK v2:
-//
-// 0 63
-// | TTSB ID | PREFIX | SPAN | X | Y |
-// +---------+--------+---------+-----+-----+
-// | 27 | 1 (=0) | 12 (=0) | 12 | 12 |
-//
-//
-// TTPK v2:
-//
-// 0 63
-// | TTPB ID | PREFIX | SPAN | X | Y |
-// +---------+--------+------+-----+-----+
-// | 27 | 1 (=1) | 12 | 12 | 12 |
-//
-//
-
-//
-// TTCK (32-BIT COMPARE) v1:
-//
-// 0 63
-// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
-// +----------------------+--------+--------+-------+-----+-----+
-// | 30 | 1 | 1 | 18 | 7 | 7 |
-//
-//
-// TTCK (32-BIT COMPARE) v2:
-//
-// 0 63
-// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
-// +----------------------+--------+--------+-------+-----+-----+
-// | 30 | 1 | 1 | 15 | 9 | 8 |
-//
-//
-// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
-//
-// 0 63
-// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
-// +----------------------+--------+--------+-------+-----+-----+
-// | 27 | 1 | 1 | 18 | 9 | 8 |
-//
-
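
As a rough companion to the TTCK (64-bit compare) layout sketched above, the C snippet below packs and unpacks a key with the 27/1/1/18/9/8 split; the helper names and example values are made up for illustration and are not SKC's API. Because y and x occupy the most-significant bits, sorting such keys as plain 64-bit integers groups them by tile (y, then x) and then by layer, which is presumably what the downstream sort relies on.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define TTCK_BITS_ID      27
#define TTCK_BITS_PREFIX   1
#define TTCK_BITS_ESCAPE   1
#define TTCK_BITS_LAYER   18
#define TTCK_BITS_X        9
#define TTCK_BITS_Y        8

static uint64_t ttck_pack(uint32_t id, uint32_t prefix, uint32_t escape,
                          uint32_t layer, uint32_t x, uint32_t y)
{
  uint64_t k   = 0;
  uint32_t off = 0;
  k |= (uint64_t)id     << off; off += TTCK_BITS_ID;
  k |= (uint64_t)prefix << off; off += TTCK_BITS_PREFIX;
  k |= (uint64_t)escape << off; off += TTCK_BITS_ESCAPE;
  k |= (uint64_t)layer  << off; off += TTCK_BITS_LAYER;
  k |= (uint64_t)x      << off; off += TTCK_BITS_X;
  k |= (uint64_t)y      << off;
  return k;
}

static uint32_t ttck_get(uint64_t k, uint32_t off, uint32_t bits)
{
  return (uint32_t)((k >> off) & ((1u << bits) - 1u));
}

int main(void)
{
  uint64_t const k = ttck_pack(12345, 0, 0, 42, 100, 50);
  printf("key = %016" PRIX64 "\n", k);
  printf("layer = %u  x = %u  y = %u\n",
         ttck_get(k, 29, TTCK_BITS_LAYER),   /* layer starts at bit 27+1+1 = 29 */
         ttck_get(k, 47, TTCK_BITS_X),       /* x starts at bit 29+18     = 47 */
         ttck_get(k, 56, TTCK_BITS_Y));      /* y starts at bit 47+9      = 56 */
  return 0;
}
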
-union skc_subgroup_smem
-{
- skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // will only use SKC_PLACE_SUBGROUP_SIZE
-
- struct {
- struct {
- skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
- skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
- } lo;
-
- struct {
- skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
- skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
- } hi;
-
- // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK];
- };
-
-};
-
-//
-// scatter scan max
-//
-static
-skc_int_v_t
-skc_scatter_scan_max(__local union skc_subgroup_smem volatile * const smem,
- skc_int_v_t const iss,
- skc_int_v_t const ess)
-{
- //
- // prefix sums determine which lanes we're going to work on next
- //
- skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE);
- skc_int_v_t const scratch_idx = max(ess,0);
-
- //
- // SIMT
- //
-
- //
- // zero the volatile smem scratchpad using vector syntax
- //
- smem->scratch[get_sub_group_local_id()] = ( 0 );
-
- //
- // store source lane at starting lane
- //
- if (is_scratch_store) {
- smem->scratch[scratch_idx] = get_sub_group_local_id();
- }
-
- //
- // propagate lanes to right using max scan
- //
- skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()];
- skc_int_v_t const source = sub_group_scan_inclusive_max(scratch);
-
- return source;
-}
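
Put sequentially, skc_scatter_scan_max answers "which source lane does output slot j belong to?": each producing lane writes its lane id at its exclusive-sum position, and an inclusive max-scan then carries that id rightwards across the slots that lane covers. The scalar C model below mirrors that, with one simplification (only lanes with a non-zero span scatter) and an assumed 8-wide subgroup.

#include <stdio.h>

#define WIDTH 8

int main(void)
{
  int const span[WIDTH] = { 3, 0, 2, 1, 0, 0, 2, 0 };   /* tiles covered per pk key */

  /* inclusive and exclusive prefix sums over the spans */
  int iss[WIDTH], ess[WIDTH], sum = 0;
  for (int i = 0; i < WIDTH; i++) {
    ess[i] = sum;
    sum   += span[i];
    iss[i] = sum;
  }

  /* scatter: a producing lane writes its id at its exclusive offset */
  int scratch[WIDTH] = { 0 };
  for (int i = 0; i < WIDTH; i++)
    if (span[i] > 0 && iss[i] > 0 && ess[i] < WIDTH)
      scratch[ess[i]] = i;

  /* inclusive max-scan propagates each source lane id to the right */
  int run = 0;
  for (int j = 0; j < WIDTH; j++) {
    if (scratch[j] > run)
      run = scratch[j];
    int const source = run;
    int const dx     = j - ess[source];    /* offset into the source key's span */
    printf("output %d <- source lane %d, dx %d\n", j, source, dx);
  }
  return 0;
}

The dx computed here is exactly what skc_pk_to_ck adds to the tile x coordinate when a TTPK key is expanded into per-tile TTCK keys.
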
-
-//
-//
-//
-
-static
-skc_bool
-skc_xk_clip(union skc_tile_clip const * const tile_clip,
- skc_ttxk_t * const xk)
-{
- //
- // clip the sk and pk keys
- //
- // if fully clipped then return false
- //
- // alternatively -- we can expand all these keys in place
- //
- // alternatively -- keep sk and pk keys segregated because sk keys
- // represent the vast majority of keys and are easier to process.
- // don't mess with the fastpath!
- //
- return false;
-}
-
-//
-//
-//
-
-static
-skc_ttck_t
-skc_sk_to_ck(__local union skc_subgroup_smem volatile * const smem,
- union skc_cmd_place const * const cmd,
- skc_uint const sk_idx)
-{
- skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0
- skc_uint const hi = smem->hi.sk[sk_idx];
-
- skc_ttck_t ck;
-
- ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id
-
- // FIXME -- x and y should already be clipped and shifted
- skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
- skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
-
- ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;
-
- return ck;
-}
-
-static
-skc_ttck_t
-skc_pk_to_ck(__local union skc_subgroup_smem volatile * const smem,
- union skc_cmd_place const * const cmd,
- skc_uint const pk_idx,
- skc_uint const dx)
-{
- skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1
- skc_uint const hi = smem->hi.pk[pk_idx];
-
- skc_ttck_t ck;
-
- ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id
-
- // FIXME -- x and y should already be clipped and shifted
- skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
- skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
-
- ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;
-
- return ck;
-}
-
-//
-//
-//
-
-static
-void
-skc_ttsk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics,
- __global skc_ttck_t * const ck_extent,
- __local union skc_subgroup_smem volatile * const smem,
- union skc_cmd_place const * const cmd,
- skc_uint const sk)
-{
- //
- // Pretty sure you can never ever have an sk count equal to 0
- //
- skc_uint ck_base = 0;
-
- // last lane performs the block pool allocation with an atomic increment
- if (get_sub_group_local_id() == 0) {
- ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk);
- }
-
- // broadcast base to all lanes
- ck_base = sub_group_broadcast(ck_base,0);
-
- // convert sk keys to ck keys
- for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE)
- {
- ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii);
- }
-}
-
-//
-//
-//
-
-static
-skc_int
-skc_ttpk_get_span(__local union skc_subgroup_smem volatile * const smem,
- skc_uint const idx)
-{
- skc_uint const lo = smem->lo.pk[idx];
- skc_uint const hi = smem->hi.pk[idx];
-
- skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN;
- skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN;
-
- return (span_lo | span_hi) + 1;
-}
-
-//
-//
-//
-
-static
-void
-skc_ttpk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics,
- __global skc_ttck_t * const ck_extent,
- __local union skc_subgroup_smem volatile * const smem,
- union skc_cmd_place const * const cmd,
- skc_uint const pk)
-{
- // bail out if pk queue is empty
- if (pk == 0)
- return;
-
-#if 0
- if (get_sub_group_local_id() == 0)
- printf("%u\n",pk);
-#endif
-
- //
- // FIXME -- this nested loop iterates over the queue processing a
- // subgroup of 64-bit keys at a time. This is probably not the most
- // efficient approach so investigate how to store and iterate over a
- // wider than subgroup (node-sized) queue of keys.
- //
-
- // round up so we work with full subgroups
- skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK;
- skc_uint ii = 0;
-
- // nested loop that expands all ttpk keys
-#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE)
- for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE)
-#endif
- {
- skc_uint idx = ii + get_sub_group_local_id();
- skc_int span = 0;
-
- // how many tiles does this ttpk span?
- if (idx < pk)
- span = skc_ttpk_get_span(smem,idx);
-
- // we need inclusive, exclusive and total
- skc_int iss = sub_group_scan_inclusive_add(span);
- skc_int ess = iss - span;
- skc_int rem = sub_group_broadcast(iss,SKC_PLACE_SUBGROUP_SIZE-1);
-
- // printf("%u : %u\n",span,iss);
- // continue;
-
- // atomically allocate space for the pk keys
- skc_uint ck_base = 0;
-
- // last lane performs the block pool allocation with an atomic increment
- if (get_sub_group_local_id() == 0) {
- ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,rem);
- }
-
- // broadcast atomically allocated extent base to all lanes
- skc_uint ck_idx = sub_group_broadcast(ck_base,0) + get_sub_group_local_id();
-
- //
- // FIXME -- this loop would probably be faster if the ttpk keys
- // were held in registers and accessed with shuffles instead of
- // SMEM loads
- //
-
- //
- // loop until there are no more expanded pk keys
- //
- while (true)
- {
- skc_int const source = skc_scatter_scan_max(smem,iss,ess);
- skc_int const dx = get_sub_group_local_id() - intel_sub_group_shuffle(ess,source);
-
- // store valid ck keys to gmem
- if (get_sub_group_local_id() < rem) {
- ck_extent[ck_idx] = skc_pk_to_ck(smem,cmd,ii+source,dx);
- }
-
- // decrement remainder
- rem -= SKC_PLACE_SUBGROUP_SIZE;
-
- if (rem <= 0)
- break;
-
- // increment/decrement indices
- ck_idx += SKC_PLACE_SUBGROUP_SIZE;
- iss -= SKC_PLACE_SUBGROUP_SIZE;
- ess -= SKC_PLACE_SUBGROUP_SIZE;
- }
- }
-}
-
-//
-//
-//
-
-static
-skc_uint
-skc_ballot(skc_uint * const xk, skc_uint const is_xk)
-{
-#if 0
- //
- // FIXME -- when available, this should use the idiom:
- //
- // ballot() + lane_mask_less_than_or_equal + popcount()
- //
- // Supported by:
- //
- // - Vulkan 1.1 / SPIR-V 1.3
- // - CUDA
- // - AVX2 (SSE*?)
- //
-#else
- //
- // otherwise, emulate with an inclusive scan (yuk)
- //
- skc_uint const prefix = sub_group_scan_inclusive_add(is_xk);
-
- skc_uint const xk_idx = *xk + prefix - is_xk;
-
- *xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST);
-
-#if 0
- printf("< %3u >\n",xk_idx);
-#endif
-
- return xk_idx;
-#endif
-}
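
The comment above names the preferred compaction idiom -- ballot(), a less-than lane mask, and popcount -- while the code falls back to an inclusive add scan. Both give each asserting lane a dense output slot and advance the running total by the same amount. A scalar C comparison of the two, with an assumed 8-lane subgroup and a portable popcount:

#include <stdio.h>
#include <stdint.h>

#define LANES 8

static uint32_t popcount32(uint32_t v)
{
  uint32_t c = 0;
  while (v) { v &= v - 1u; c++; }
  return c;
}

int main(void)
{
  uint32_t const is_xk[LANES] = { 1, 0, 1, 1, 0, 1, 0, 0 };
  uint32_t base = 0;

  /* build the "ballot" bitmask across lanes */
  uint32_t ballot = 0;
  for (uint32_t lane = 0; lane < LANES; lane++)
    ballot |= is_xk[lane] << lane;

  uint32_t scan = 0;                        /* running inclusive add scan */
  for (uint32_t lane = 0; lane < LANES; lane++) {
    scan += is_xk[lane];

    /* idiom 1: popcount of the ballot bits strictly below this lane */
    uint32_t const idx_ballot = base + popcount32(ballot & ((1u << lane) - 1u));

    /* idiom 2: inclusive scan minus own contribution (the fallback) */
    uint32_t const idx_scan   = base + scan - is_xk[lane];

    if (is_xk[lane])
      printf("lane %u -> slot %u (ballot) / %u (scan)\n", lane, idx_ballot, idx_scan);
  }
  base += scan;                             /* broadcast of the last lane's scan */
  printf("new running total = %u\n", base);
  return 0;
}
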
-
-//
-//
-//
-__kernel
-SKC_PLACE_KERNEL_ATTRIBS
-void
-skc_kernel_place(__global skc_bp_elem_t * const bp_elems,
- __global SKC_ATOMIC_UINT volatile * const place_atomics,
- __global skc_ttck_t * const ck_extent,
- __global union skc_cmd_place const * const cmds,
- __global skc_block_id_t * const map,
- skc_uint4 const clip,
- skc_uint const count)
-{
- //
- // declare shared memory block
- //
-#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
- __local union skc_subgroup_smem volatile smem[1];
-#else
- __local union skc_subgroup_smem volatile smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS];
- __local union skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
-#endif
-
- //
- // This is a subgroup-centric kernel
- //
- // Which subgroup in the grid is this?
- //
- // TAKE NOTE: the Intel GEN compiler appears to be recognizing
- // get_group_id(0) as a uniform but the alternative calculation used
- // when there are multiple subgroups per workgroup is not being
- // recognized as uniform and is driving register spillage elsewhere.
- //
- // Test the raster's translated bounds against the composition's
- // tile clip
- //
- // There are 3 cases:
- //
- // - the raster is completely clipped -> return
- // - the raster is partially clipped -> all keys must be clipped
- // - the raster is not clipped -> no keys are tested
- //
- //
- // There are at least 4 implementations of place and we want to
- // special-case them as much as possible so that, at the least, the
- // fastpath remains fast.
- //
- // - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP
- //
- // - implement CLIPPED + NO TRANSLATION path
- //
- // - implement NO CLIP + TRANSLATION path
- //
- // - implement CLIPPED + TRANSLATION path
- //
- //
- // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin
- // 12:12:8 integer where:
- //
- // 12: ttsk
- // 12: ttpk
- // 8: /dev/null -- clipped or invalid key
- //
- // Three kinds of nodes in a raster's list:
- //
- // - the head node
- // - an internal node
- // - the final node
- //
-
-#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
- skc_uint const cmd_idx = get_group_id(0);
-#else
- skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id();
-#endif
-
- // load command
- union skc_cmd_place const cmd = cmds[cmd_idx];
-
- // get the raster header from the raster host id -- scalar
- skc_block_id_t id = map[cmd.raster_h];
-
- //
- // load all of the head block ttxk keys into registers
- //
- // FIXME -- this pattern lends itself to using the higher
- // performance Intel GEN block load instructions
- //
- skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- union skc_raster_node_elem const h##I = { \
- .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)], \
- bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)] } \
- };
-
- SKC_PLACE_EXPAND();
-
- //
- // load raster header counts -- we only need the "nodes" and "keys"
- // words but the keys we loaded are doublewords.
- //
- // FIXME -- this can be made portable with compile-time macro expansion
- //
- skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
- skc_uint keys = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
-
- //
- //
- //
-#if 0
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \
- nodes,keys, \
- I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \
- h##I.u32v2.hi,h##I.u32v2.lo, \
- h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);
-
- SKC_PLACE_EXPAND();
-#endif
-
- //
-#if 0
- if (get_sub_group_local_id() == 0) {
- printf("place: %u / %u / %u\n",head_id,nodes,keys);
- }
-#endif
-
- {
- //
- // classify every key in the header
- //
- // keys: 0 is not a key / 1 is a key
- // skpk: 0 is sk / 1 is pk
- //
- skc_uint bits_keys = 0;
- skc_uint bits_skpk = 0;
-
- //
- // calculate bits_keys
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
- skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \
- if (idx < keys) { \
- bits_keys |= (1u << I); \
- } \
- if (SKC_PLACE_IS_TRAILING_ROW(I)) { \
- if (keys > SKC_RASTER_HEAD_COUNT_KEYS) { \
- if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \
- bits_keys &= ~(1u << I); \
- } \
- } \
- } \
- }
-
- SKC_PLACE_EXPAND();
-
- //
- // blindly calculate bits_skpk
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
- bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
- }
-
- SKC_PLACE_EXPAND();
-
-#if 0
- printf("%2X : %2X\n",bits_keys,bits_skpk);
-#endif
-
- //
- // next pointer is last element of last row. save it now because
- // this might be recognized as a subgroup-uniform/scalar.
- //
- id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);
-
- //
- // append SK keys first
- //
- skc_uint const bits_sk = bits_keys & ~bits_skpk;
- skc_uint sk = 0;
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
- skc_uint is_sk = (bits_sk >> I) & 1; \
- skc_uint sk_idx = skc_ballot(&sk,is_sk); \
- if (is_sk) { \
- smem->lo.sk[sk_idx] = h##I.xk.lo; \
- smem->hi.sk[sk_idx] = h##I.xk.hi; \
- } \
- }
-
- SKC_PLACE_EXPAND();
-
- //
- // append PK keys next
- //
- skc_uint const bits_pk = bits_keys & bits_skpk;
- skc_uint pk = 0;
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
- skc_uint is_pk = (bits_pk >> I) & 1; \
- skc_uint pk_idx = skc_ballot(&pk,is_pk); \
- if (is_pk) { \
- smem->lo.pk[pk_idx] = h##I.xk.lo; \
- smem->hi.pk[pk_idx] = h##I.xk.hi; \
- } \
- }
-
- SKC_PLACE_EXPAND();
-
-#if 0
- printf("%2u * %2u\n",sk,pk);
-#endif
- //
- // flush the keys
- //
- skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
- skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
- }
-
- //
- // we're done if there was only a head node
- //
- if (nodes == 0)
- return;
-
- //
- // decrement keys
- //
- keys -= SKC_RASTER_HEAD_COUNT_KEYS;
-
- //
- // otherwise, append keys in trailing nodes to smem
- //
- while (true)
- {
- //
- // load all of the node block ttxk keys into registers
- //
- // FIXME -- this pattern lends itself to using the higher
- // performance Intel GEN block load instructions
- //
- skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- union skc_raster_node_elem const n##I = { \
- .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)], \
- bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)] } \
- };
-
- SKC_PLACE_EXPAND();
-
-#if 0
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \
- nodes,keys, \
- I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \
- n##I.u32v2.hi,n##I.u32v2.lo, \
- n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);
-
- SKC_PLACE_EXPAND();
-#endif
-
- //
- // classify every key in the header
- //
- // keys: 0 is not a key / 1 is a key
- // skpk: 0 is sk / 1 is pk
- //
- skc_uint bits_keys = 0;
- skc_uint bits_skpk = 0;
-
- //
- // calculate bits_keys
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) { \
- skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \
- if (idx < keys) { \
- bits_keys |= (1u << I); \
- } \
- if (SKC_PLACE_IS_TRAILING_ROW(I)) { \
- if (keys > SKC_RASTER_NODE_COUNT_KEYS) { \
- if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \
- bits_keys &= ~(1u << I); \
- } \
- } \
- } \
- }
-
- SKC_PLACE_EXPAND();
-
- //
- // blindly calculate bits_skpk
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) { \
- bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
- }
-
- SKC_PLACE_EXPAND();
-
-#if 0
- printf("%2X : %2X\n",bits_keys,bits_skpk);
-#endif
-
- //
- // next pointer is last element of last row. save it now because
- // this might be recognized as a subgroup-uniform/scalar.
- //
- id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);
-
- //
- // append SK keys first
- //
- skc_uint const bits_sk = bits_keys & ~bits_skpk;
- skc_uint sk = 0;
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) { \
- skc_uint is_sk = (bits_sk >> I) & 1; \
- skc_uint sk_idx = skc_ballot(&sk,is_sk); \
- if (is_sk) { \
- smem->lo.sk[sk_idx] = n##I.xk.lo; \
- smem->hi.sk[sk_idx] = n##I.xk.hi; \
- } \
- }
-
- SKC_PLACE_EXPAND();
-
- //
- // append PK keys next
- //
- skc_uint const bits_pk = bits_keys & bits_skpk;
- skc_uint pk = 0;
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) { \
- skc_uint is_pk = (bits_pk >> I) & 1; \
- skc_uint pk_idx = skc_ballot(&pk,is_pk); \
- if (is_pk) { \
- smem->lo.pk[pk_idx] = n##I.xk.lo; \
- smem->hi.pk[pk_idx] = n##I.xk.hi; \
- } \
- }
-
- SKC_PLACE_EXPAND();
-
-#if 0
- printf("%2u * %2u\n",sk,pk);
-#endif
- //
- // if total for either the sk or pk queue reaches the
- // highwater mark then flush it to the extent
- //
- skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
- skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
-
- //
- // if this was the last node then we're done
- //
- if (--nodes == 0)
- return;
-
- //
- // otherwise decrement keys
- //
- keys -= SKC_RASTER_NODE_COUNT_KEYS;
- }
-}
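
Stripped of the subgroup machinery, the place kernel above performs a simple walk: load the raster head, split its keys into sk and pk queues using the prefix bit, flush both queues into the ck extent, then follow the next-node link and repeat for each trailing node until the key and node counts run out. The scalar C sketch below captures just that control flow; the block layout, counts and prefix mask are assumptions for illustration, not SKC's real structures.

#include <stdio.h>
#include <stdint.h>

#define BLOCK_KEYS 4u                        /* keys held per block (assumed)        */
#define PREFIX_BIT (1u << 31)                /* stand-in for SKC_TTXK_LO_MASK_PREFIX */

typedef struct { uint32_t lo[BLOCK_KEYS]; uint32_t next; } node_t;

static void flush(char const * queue, uint32_t const * keys, uint32_t n)
{
  for (uint32_t i = 0; i < n; i++)
    printf("%s key %08X\n", queue, keys[i]);
}

int main(void)
{
  node_t const pool[2] = {
    { { 0x10, PREFIX_BIT | 0x20, 0x30, PREFIX_BIT | 0x40 }, 1 },  /* head -> node 1 */
    { { 0x50, 0, 0, 0 },                                    0 },  /* last node      */
  };

  uint32_t keys_left = 5, nodes = 1, id = 0;

  for (;;) {
    node_t const * const n  = &pool[id];
    uint32_t const in_block = keys_left < BLOCK_KEYS ? keys_left : BLOCK_KEYS;

    /* classify this block's keys by the prefix bit: 0 = ttsk, 1 = ttpk */
    uint32_t sk[BLOCK_KEYS], pk[BLOCK_KEYS], nsk = 0, npk = 0;
    for (uint32_t i = 0; i < in_block; i++) {
      if (n->lo[i] & PREFIX_BIT) pk[npk++] = n->lo[i];
      else                       sk[nsk++] = n->lo[i];
    }
    flush("sk", sk, nsk);
    flush("pk", pk, npk);

    if (nodes-- == 0)              /* head only, or last trailing node reached */
      break;

    keys_left -= in_block;
    id = n->next;                  /* follow the link to the next node */
  }
  return 0;
}
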
-
-//
-//
-//
+/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "tile.h" +#include "common.h" +#include "raster.h" +#include "atomic_cl.h" +#include "kernel_cl_12.h" + +// +// +// + +#define SKC_PLACE_SUBGROUP_MASK (SKC_PLACE_SUBGROUP_SIZE - 1) +#define SKC_PLACE_SUBGROUP_LAST (SKC_PLACE_SUBGROUP_SIZE - 1) + +// +// +// + +#define SKC_PLACE_SMEM_COUNT_TTSK SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE) +#define SKC_PLACE_SMEM_COUNT_TTPK SKC_RASTER_NODE_MAX_TTPK + +// +// +// + +#define SKC_PLACE_X (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE) + +// +// +// + +#if ( SKC_PLACE_X == 1 ) +#define SKC_PLACE_EXPAND() SKC_EXPAND_1() +#define SKC_PLACE_EXPAND_I_LAST 0 + +#elif ( SKC_PLACE_X == 2 ) +#define SKC_PLACE_EXPAND() SKC_EXPAND_2() +#define SKC_PLACE_EXPAND_I_LAST 1 + +#elif ( SKC_PLACE_X == 4 ) +#define SKC_PLACE_EXPAND() SKC_EXPAND_4() +#define SKC_PLACE_EXPAND_I_LAST 3 + +#elif ( SKC_PLACE_X == 8 ) +#define SKC_PLACE_EXPAND() SKC_EXPAND_8() +#define SKC_PLACE_EXPAND_I_LAST 7 + +#elif ( SKC_PLACE_X == 16) +#define SKC_PLACE_EXPAND() SKC_EXPAND_16() +#define SKC_PLACE_EXPAND_I_LAST 15 +#endif + +// +// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE +// COALESCED WRITES. LO FIRST, FOLLOWED BY HI. +// +// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE +// KERNELS USE DIFFERENT SUBGROUP SIZES. +// +// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE +// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID. +// +// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER +// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). VULKAN MAY +// ONLY SUPPORT A SUBGROUP SIZE OF 16. 
+// + +#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE ) + +#define SKC_PLACE_STRIDE_H(L) (L) +#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE) +#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE) + +#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1 + +#define SKC_PLACE_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE) +#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_RATIO - 1) +#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I) ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK)) + +#define SKC_PLACE_STRIDE_H(L) (L) +#define SKC_PLACE_STRIDE_V_LO(I) (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE) +#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE) + +#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1 + +#define SKC_PLACE_SUBGROUP_RATIO (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE) +#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask + +#define SKC_PLACE_STRIDE_H(L) (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK)) +#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE) +#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO) + +#endif + +// +// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE +// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8) +// + +#define SKC_PLACE_IS_ALL_HEADER_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS) + +#define SKC_PLACE_IS_NOT_HEADER_ROW(i) ( (i) * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS) + +#define SKC_PLACE_IS_TRAILING_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS) + +#define SKC_PLACE_IS_HEADER_ROW_KEY(i) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k)) + + +// +// Note: HEADER_LESS_THAN purposefully wraps unsigned integer to ~UINT_MAX +// +#define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k)) +#define SKC_PLACE_NODE_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() < (k)) + +// +// TTSK v2: +// +// 0 63 +// | TTSB ID | PREFIX | SPAN | X | Y | +// +---------+--------+---------+-----+-----+ +// | 27 | 1 (=0) | 12 (=0) | 12 | 12 | +// +// +// TTPK v2: +// +// 0 63 +// | TTPB ID | PREFIX | SPAN | X | Y | +// +---------+--------+------+-----+-----+ +// | 27 | 1 (=1) | 12 | 12 | 12 | +// +// + +// +// TTCK (32-BIT COMPARE) v1: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 30 | 1 | 1 | 18 | 7 | 7 | +// +// +// TTCK (32-BIT COMPARE) v2: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 30 | 1 | 1 | 15 | 9 | 8 | +// +// +// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 27 | 1 | 1 | 18 | 9 | 8 | +// + +union skc_subgroup_smem +{ + skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // will only use SKC_PLACE_SUBGROUP_SIZE + + 
struct { + struct { + skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK]; + skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK]; + } lo; + + struct { + skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK]; + skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK]; + } hi; + + // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK]; + }; + +}; + +// +// scatter scan max +// +static +skc_int_v_t +skc_scatter_scan_max(__local union skc_subgroup_smem volatile * const smem, + skc_int_v_t const iss, + skc_int_v_t const ess) +{ + // + // prefix sums determine which lanes we're going to work on next + // + skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE); + skc_int_v_t const scratch_idx = max(ess,0); + + // + // SIMT + // + + // + // zero the volatile smem scratchpad using vector syntax + // + smem->scratch[get_sub_group_local_id()] = ( 0 ); + + // + // store source lane at starting lane + // + if (is_scratch_store) { + smem->scratch[scratch_idx] = get_sub_group_local_id(); + } + + // + // propagate lanes to right using max scan + // + skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()]; + skc_int_v_t const source = sub_group_scan_inclusive_max(scratch); + + return source; +} + +// +// +// + +static +skc_bool +skc_xk_clip(union skc_tile_clip const * const tile_clip, + skc_ttxk_t * const xk) +{ + // + // clip the sk and pk keys + // + // if fully clipped then return false + // + // alternatively -- we can expand all these keys in place + // + // alternatively -- keep sk and pk keys segregated because sk + // represents the vast majority of keys and are easier to process. + // don't mess with the fastpath! + // + return false; +} + +// +// +// + +static +skc_ttck_t +skc_sk_to_ck(__local union skc_subgroup_smem volatile * const smem, + union skc_cmd_place const * const cmd, + skc_uint const sk_idx) +{ + skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0 + skc_uint const hi = smem->hi.sk[sk_idx]; + + skc_ttck_t ck; + + ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id + + // FIXME -- x and y should already be clipped and shifted + skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X; + skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y; + + ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y; + + return ck; +} + +static +skc_ttck_t +skc_pk_to_ck(__local union skc_subgroup_smem volatile * const smem, + union skc_cmd_place const * const cmd, + skc_uint const pk_idx, + skc_uint const dx) +{ + skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1 + skc_uint const hi = smem->hi.pk[pk_idx]; + + skc_ttck_t ck; + + ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id + + // FIXME -- x and y should already be clipped and shifted + skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X; + skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y; + + ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y; + + return ck; +} + +// +// +// + +static +void +skc_ttsk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics, + __global skc_ttck_t * const ck_extent, + __local union skc_subgroup_smem volatile * const smem, + union skc_cmd_place const * const cmd, + skc_uint const sk) +{ + // + // Pretty sure you can never ever have an sk 
count equal to 0 + // + skc_uint ck_base = 0; + + // last lane performs the block pool allocation with an atomic increment + if (get_sub_group_local_id() == 0) { + ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk); + } + + // broadcast base to all lanes + ck_base = sub_group_broadcast(ck_base,0); + + // convert sk keys to ck keys + for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE) + { + ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii); + } +} + +// +// +// + +static +skc_int +skc_ttpk_get_span(__local union skc_subgroup_smem volatile * const smem, + skc_uint const idx) +{ + skc_uint const lo = smem->lo.pk[idx]; + skc_uint const hi = smem->hi.pk[idx]; + + skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN; + skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN; + + return (span_lo | span_hi) + 1; +} + +// +// +// + +static +void +skc_ttpk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics, + __global skc_ttck_t * const ck_extent, + __local union skc_subgroup_smem volatile * const smem, + union skc_cmd_place const * const cmd, + skc_uint const pk) +{ + // bail out if pk queue is empty + if (pk == 0) + return; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("%u\n",pk); +#endif + + // + // FIXME -- this nested loop iterates over the queue processing a + // subgroup of 64-bit keys at a time. This is probably not the most + // efficient approach so investigate how to store and iterate over a + // wider than subgroup (node-sized) queue of keys. + // + + // round up so we work with full subgroups + skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK; + skc_uint ii = 0; + + // nested loop that expands all ttpk keys +#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE) + for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE) +#endif + { + skc_uint idx = ii + get_sub_group_local_id(); + skc_int span = 0; + + // how many tiles does this ttpk span? 
+ if (idx < pk) + span = skc_ttpk_get_span(smem,idx); + + // we need inclusive, exclusive and total + skc_int iss = sub_group_scan_inclusive_add(span); + skc_int ess = iss - span; + skc_int rem = sub_group_broadcast(iss,SKC_PLACE_SUBGROUP_SIZE-1); + + // printf("%u : %u\n",span,iss); + // continue; + + // atomically allocate space for the pk keys + skc_uint ck_base = 0; + + // last lane performs the block pool allocation with an atomic increment + if (get_sub_group_local_id() == 0) { + ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,rem); + } + + // broadcast atomically allocated extent base to all lanes + skc_uint ck_idx = sub_group_broadcast(ck_base,0) + get_sub_group_local_id(); + + // + // FIXME -- this loop would probably be faster if the ttpk keys + // were held in registers and accessed with shuffles instead of + // SMEM loads + // + + // + // loop until there are no more expanded pk keys + // + while (true) + { + skc_int const source = skc_scatter_scan_max(smem,iss,ess); + skc_int const dx = get_sub_group_local_id() - intel_sub_group_shuffle(ess,source); + + // store valid ck keys to gmem + if (get_sub_group_local_id() < rem) { + ck_extent[ck_idx] = skc_pk_to_ck(smem,cmd,ii+source,dx); + } + + // decrement remainder + rem -= SKC_PLACE_SUBGROUP_SIZE; + + if (rem <= 0) + break; + + // increment/decrement indices + ck_idx += SKC_PLACE_SUBGROUP_SIZE; + iss -= SKC_PLACE_SUBGROUP_SIZE; + ess -= SKC_PLACE_SUBGROUP_SIZE; + } + } +} + +// +// +// + +static +skc_uint +skc_ballot(skc_uint * const xk, skc_uint const is_xk) +{ +#if 0 + // + // FIXME -- when available, this should use the idiom: + // + // ballot() + lane_mask_less_than_or_equal + popcount() + // + // Supported by: + // + // - Vulkan 1.1 / SPIR-V 1.3 + // - CUDA + // - AVX2 (SSE*?) + // +#else + // + // otherwise, emulate with an inclusive scan (yuk) + // + skc_uint const prefix = sub_group_scan_inclusive_add(is_xk); + + skc_uint const xk_idx = *xk + prefix - is_xk; + + *xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST); + +#if 0 + printf("< %3u >\n",xk_idx); +#endif + + return xk_idx; +#endif +} + +// +// +// +__kernel +SKC_PLACE_KERNEL_ATTRIBS +void +skc_kernel_place(__global skc_bp_elem_t * const bp_elems, + __global SKC_ATOMIC_UINT volatile * const place_atomics, + __global skc_ttck_t * const ck_extent, + __global union skc_cmd_place const * const cmds, + __global skc_block_id_t * const map, + skc_uint4 const clip, + skc_uint const count) +{ + // + // declare shared memory block + // +#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 ) + __local union skc_subgroup_smem volatile smem[1]; +#else + __local union skc_subgroup_smem volatile smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS]; + __local union skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); +#endif + + // + // This is a subgroup-centric kernel + // + // Which subgroup in the grid is this? + // + // TAKE NOTE: the Intel GEN compiler appears to be recognizing + // get_group_id(0) as a uniform but the alternative calculation used + // when there are multiple subgroups per workgroup is not + // cooperating and driving spillage elsewhere. 
+ // + // Test the raster's translated bounds against the composition's + // tile clip + // + // There are 3 cases: + // + // - the raster is completely clipped -> return + // - the raster is partially clipped -> all keys must clipped + // - the raster is not clipped -> no keys are tested + // + // + // There are at least 4 implementations of place and we want to + // special-case them as much as possible so that, at the least, the + // fastpath remains fast. + // + // - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP + // + // - implement CLIPPED + NO TRANSLATION path + // + // - implement NO CLIP + TRANSLATION path + // + // - implement CLIPPED + TRANSLATION path + // + // + // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin + // 12:12:8 integer where: + // + // 12: ttsk + // 12: ttpk + // 8: /dev/null -- clipped or invalid key + // + // Three kinds of nodes in a raster's list: + // + // - the head node + // - an internal node + // - the final node + // + +#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 ) + skc_uint const cmd_idx = get_group_id(0); +#else + skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + + // load command + union skc_cmd_place const cmd = cmds[cmd_idx]; + + // get the raster header from the raster host id -- scalar + skc_block_id_t id = map[cmd.raster_h]; + + // + // load all of the head block ttxk keys into registers + // + // FIXME -- this pattern lends itself to using the higher + // performance Intel GEN block load instructions + // + skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id()); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + union skc_raster_node_elem const h##I = { \ + .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)], \ + bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)] } \ + }; + + SKC_PLACE_EXPAND(); + + // + // load raster header counts -- we only need the "nodes" and "keys" + // words but the keys we loaded are doublewords. + // + // FIXME -- this can be made portable with compile-time macro expansion + // + skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES + skc_uint keys = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS + + // + // + // +#if 0 +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + printf("%5u : %6u : %3u : %08X . 
%08X - %08X\n", \ + nodes,keys, \ + I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \ + h##I.u32v2.hi,h##I.u32v2.lo, \ + h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX); + + SKC_PLACE_EXPAND(); +#endif + + // +#if 0 + if (get_sub_group_local_id() == 0) { + printf("place: %u / %u / %u\n",head_id,nodes,keys); + } +#endif + + { + // + // classify every key in the header + // + // keys: 0 is not a key / 1 is a key + // skpk: 0 is sk / 1 is pk + // + skc_uint bits_keys = 0; + skc_uint bits_skpk = 0; + + // + // calculate bits_keys + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ + skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \ + if (idx < keys) { \ + bits_keys |= (1u << I); \ + } \ + if (SKC_PLACE_IS_TRAILING_ROW(I)) { \ + if (keys > SKC_RASTER_HEAD_COUNT_KEYS) { \ + if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \ + bits_keys &= ~(1u << I); \ + } \ + } \ + } \ + } + + SKC_PLACE_EXPAND(); + + // + // blindly calculate bits_skpk + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ + bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \ + } + + SKC_PLACE_EXPAND(); + +#if 0 + printf("%2X : %2X\n",bits_keys,bits_skpk); +#endif + + // + // next pointer is last element of last row. save it now because + // this might be recognized as a subgroup-uniform/scalar. + // + id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST); + + // + // append SK keys first + // + skc_uint const bits_sk = bits_keys & ~bits_skpk; + skc_uint sk = 0; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ + skc_uint is_sk = (bits_sk >> I) & 1; \ + skc_uint sk_idx = skc_ballot(&sk,is_sk); \ + if (is_sk) { \ + smem->lo.sk[sk_idx] = h##I.xk.lo; \ + smem->hi.sk[sk_idx] = h##I.xk.hi; \ + } \ + } + + SKC_PLACE_EXPAND(); + + // + // append PK keys next + // + skc_uint const bits_pk = bits_keys & bits_skpk; + skc_uint pk = 0; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \ + skc_uint is_pk = (bits_pk >> I) & 1; \ + skc_uint pk_idx = skc_ballot(&pk,is_pk); \ + if (is_pk) { \ + smem->lo.pk[pk_idx] = h##I.xk.lo; \ + smem->hi.pk[pk_idx] = h##I.xk.hi; \ + } \ + } + + SKC_PLACE_EXPAND(); + +#if 0 + printf("%2u * %2u\n",sk,pk); +#endif + // + // flush the keys + // + skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk); + skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk); + } + + // + // we're done if there was only a head node + // + if (nodes == 0) + return; + + // + // decrement keys + // + keys -= SKC_RASTER_HEAD_COUNT_KEYS; + + // + // otherwise, append keys in trailing nodes to smem + // + while (true) + { + // + // load all of the node block ttxk keys into registers + // + // FIXME -- this pattern lends itself to using the higher + // performance Intel GEN block load instructions + // + skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id()); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + union skc_raster_node_elem const n##I = { \ + .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)], \ + bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)] } \ + }; + + SKC_PLACE_EXPAND(); + +#if 0 +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + printf("%5u : %6u : %3u : %08X . 
%08X - %08X\n", \ + nodes,keys, \ + I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \ + n##I.u32v2.hi,n##I.u32v2.lo, \ + n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX); + + SKC_PLACE_EXPAND(); +#endif + + // + // classify every key in the header + // + // keys: 0 is not a key / 1 is a key + // skpk: 0 is sk / 1 is pk + // + skc_uint bits_keys = 0; + skc_uint bits_skpk = 0; + + // + // calculate bits_keys + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) { \ + skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \ + if (idx < keys) { \ + bits_keys |= (1u << I); \ + } \ + if (SKC_PLACE_IS_TRAILING_ROW(I)) { \ + if (keys > SKC_RASTER_NODE_COUNT_KEYS) { \ + if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \ + bits_keys &= ~(1u << I); \ + } \ + } \ + } \ + } + + SKC_PLACE_EXPAND(); + + // + // blindly calculate bits_skpk + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) { \ + bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \ + } + + SKC_PLACE_EXPAND(); + +#if 0 + printf("%2X : %2X\n",bits_keys,bits_skpk); +#endif + + // + // next pointer is last element of last row. save it now because + // this might be recognized as a subgroup-uniform/scalar. + // + id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST); + + // + // append SK keys first + // + skc_uint const bits_sk = bits_keys & ~bits_skpk; + skc_uint sk = 0; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) { \ + skc_uint is_sk = (bits_sk >> I) & 1; \ + skc_uint sk_idx = skc_ballot(&sk,is_sk); \ + if (is_sk) { \ + smem->lo.sk[sk_idx] = n##I.xk.lo; \ + smem->hi.sk[sk_idx] = n##I.xk.hi; \ + } \ + } + + SKC_PLACE_EXPAND(); + + // + // append PK keys next + // + skc_uint const bits_pk = bits_keys & bits_skpk; + skc_uint pk = 0; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) { \ + skc_uint is_pk = (bits_pk >> I) & 1; \ + skc_uint pk_idx = skc_ballot(&pk,is_pk); \ + if (is_pk) { \ + smem->lo.pk[pk_idx] = n##I.xk.lo; \ + smem->hi.pk[pk_idx] = n##I.xk.hi; \ + } \ + } + + SKC_PLACE_EXPAND(); + +#if 0 + printf("%2u * %2u\n",sk,pk); +#endif + // + // if total for either the sk or pk queue reaches the + // highwater mark then flush it to the extent + // + skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk); + skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk); + + // + // if this was the last node then we're done + // + if (--nodes == 0) + return; + + // + // otherwise decrement keys + // + keys -= SKC_RASTER_NODE_COUNT_KEYS; + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/prefix.cl b/src/compute/skc/platforms/cl_12/kernels/prefix.cl index 21a51694da..ae3397c26d 100644 --- a/src/compute/skc/platforms/cl_12/kernels/prefix.cl +++ b/src/compute/skc/platforms/cl_12/kernels/prefix.cl @@ -1,1041 +1,1041 @@ -/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "tile.h"
-#include "block.h"
-#include "raster.h"
-#include "atomic_cl.h"
-#include "raster_builder_cl_12.h"
-#include "device_cl_12.h"
-
-//
-// INPUT:
-//
-// TTRK (64-BIT COMPARE)
-//
-// 0 63
-// | TTSB ID | X | Y | COHORT ID |
-// +---------+------+------+-----------+
-// | 27 | 12 | 12 | 13 |
-//
-//
-// TTRK (32-BIT COMPARE)
-//
-// 0 63
-// | TTSB ID | N/A | X | Y | COHORT ID |
-// +---------+-----+------+------+-----------+
-// | 27 | 5 | 12 | 12 | 8 |
-//
-//
-// OUTPUT:
-//
-// TTSK v2:
-//
-// 0 63
-// | TTSB ID | PREFIX | N/A | X | Y |
-// +---------+--------+------+----+----+
-// | 27 | 1 (=0) | 12 | 12 | 12 |
-//
-//
-// TTPK v1:
-//
-// 0 63
-// | TTPB ID | ALL ZEROES | SPAN | X | Y |
-// +---------+------------+------+-----+-----+
-// | 27 | 1 | 12 | 12 | 12 |
-//
-//
-// TTPK v2:
-//
-// 0 63
-// | TTPB ID | PREFIX | SPAN | X | Y |
-// +---------+--------+------+-----+-----+
-// | 27 | 1 (=1) | 12 | 12 | 12 |
-//
-
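As a quick sanity check of the layouts above, here is a small standalone C sketch that packs and unpacks a key using the TTSK v2 widths from the table (27-bit TTSB ID, 1-bit prefix, 12 N/A bits, then 12+12 bits of X/Y). It assumes field 0 starts at the least-significant bit; the helper names and offsets are illustrative only and are not the SKC macros.

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

// pack a hypothetical TTSK v2 key: id | prefix(=0) | n/a | x | y
static uint64_t ttsk_pack(uint32_t ttsb_id, uint32_t x, uint32_t y)
{
  return ((uint64_t)ttsb_id      ) |   // bits  0..26 : TTSB ID
         ((uint64_t)0u      << 27) |   // bit      27 : prefix = 0 for TTSK
         ((uint64_t)x       << 40) |   // bits 40..51 : X
         ((uint64_t)y       << 52);    // bits 52..63 : Y
}

static void ttsk_unpack(uint64_t k, uint32_t * id, uint32_t * x, uint32_t * y)
{
  *id = (uint32_t)( k        & 0x7FFFFFF);
  *x  = (uint32_t)((k >> 40) & 0xFFF);
  *y  = (uint32_t)((k >> 52) & 0xFFF);
}

int main(void)
{
  uint64_t const k = ttsk_pack(12345,7,9);
  uint32_t id,x,y;
  ttsk_unpack(k,&id,&x,&y);
  assert(id == 12345 && x == 7 && y == 9);
  printf("%016llX\n",(unsigned long long)k);
  return 0;
}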
-#define SKC_PREFIX_SUBGROUP_MASK (SKC_PREFIX_SUBGROUP_SIZE - 1)
-
-//
-// smem accumulator
-//
-
-union skc_subgroup_accum
-{
- struct {
- SKC_ATOMIC_INT ttp[SKC_TILE_HEIGHT];
- } atomic;
-
- struct {
- skc_ttp_t ttp[SKC_TILE_HEIGHT];
- } aN;
-
- struct {
- SKC_PREFIX_TTP_V ttp[SKC_PREFIX_SUBGROUP_SIZE];
- } vN;
-
- struct {
- SKC_PREFIX_SMEM_ZERO ttp[SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH];
- } zero;
-};
-
-//
-//
-//
-
-struct skc_subgroup_smem
-{
- // prefix accumulator
- union skc_subgroup_accum accum;
-};
-
-//
-//
-//
-
-static
-skc_uint
-skc_subgroup_lane()
-{
-#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
- return get_sub_group_local_id();
-#else
- return 0;
-#endif
-}
-
-//
-//
-//
-
-static
-SKC_PREFIX_TTS_V_BITFIELD
-skc_tts_get_dy(skc_tts_v_t const ttsv)
-{
- // tts.dy is packed to fit in range [-32,31] and unpacked to [-32..-1,+1..+32]
- SKC_PREFIX_TTS_V_BITFIELD const dy = ttsv >> SKC_TTS_OFFSET_DY;
-
- return dy - (~ttsv >> 31);
-}
-
-static
-SKC_PREFIX_TTS_V_BITFIELD
-skc_tts_get_py(skc_tts_v_t const ttsv)
-{
- return SKC_BFE(ttsv,SKC_TTS_BITS_TY-SKC_SUBPIXEL_RESL_Y_LOG2,SKC_TTS_OFFSET_TY+SKC_SUBPIXEL_RESL_Y_LOG2);
-}
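A host-side sketch of the dy trick above may help: an arithmetic shift recovers the packed 6-bit value in [-32,31], and (~ttsv >> 31) evaluates to -1 when dy is non-negative and 0 when it is negative, so the subtraction bumps the non-negative half up by one and zero is skipped. The bit offset 26 below is an assumption for illustration; the real offset is SKC_TTS_OFFSET_DY.

#include <stdint.h>
#include <stdio.h>

// Assumes arithmetic right shifts on signed ints, as the kernel does.
static int32_t tts_get_dy(int32_t tts)
{
  int32_t const dy = tts >> 26;      // sign-extend the packed 6-bit field
  return dy - (~tts >> 31);          // adds 1 when dy >= 0, leaves dy < 0 alone
}

int main(void)
{
  for (int32_t packed = -32; packed <= 31; packed++)
    {
      int32_t const tts = (int32_t)((uint32_t)packed << 26);   // dy field only
      printf("%3d -> %3d\n", packed, tts_get_dy(tts));
    }
  return 0;
}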
-
-//
-//
-//
-
-static
-void
-skc_accum_scatter(__local struct skc_subgroup_smem * const smem, skc_tts_v_t const tts_v)
-{
- // get "altitude"
- SKC_PREFIX_TTS_V_BITFIELD dy = skc_tts_get_dy(tts_v);
-
- // get the y pixel coordinate
- SKC_PREFIX_TTS_V_BITFIELD py = skc_tts_get_py(tts_v);
-
- //
- // FIXME -- benchmark performance of setting dy to 0 if tts_v is invalid?
- //
- // FIXME -- consider making TTS_INVALID a dy/py/etc. that's a no-op
- //
-
-#if 0
- if (tts_v != SKC_TTS_INVALID)
- printf("< %08X = %u : %d >\n",tts_v,py,dy);
-#endif
-
- //
- // scatter-add the "altitude" to accumulator
- //
-#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
- //
- // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) \
- if (tts_v C != SKC_TTS_INVALID) { \
- SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->accum.atomic.ttp + py C, dy C); \
- }
-
-#else
- //
- // CPU/SIMD -- ITERATE OVER VECTOR, NO NEED FOR ATOMICS
- //
- // WITH SIMD, ONCE A TTS_INVALID IS DETECTED WE CAN QUIT
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) \
- if (tts_v C == SKC_TTS_INVALID) \
- return; \
- smem->accum.aN.ttp[py C] = dy C;
-#endif
-
- SKC_PREFIX_TTS_VECTOR_INT_EXPAND();
-}
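Conceptually the scatter phase is a per-row accumulation: each valid TTS adds its signed "altitude" dy into the accumulator row selected by py, atomically on the GPU. A minimal CPU model, with an illustrative tile height and hand-picked sample values:

#include <stdio.h>

#define TILE_HEIGHT 16

int main(void)
{
  int accum[TILE_HEIGHT] = { 0 };

  struct { int py, dy; } const tts[] = { { 3, +7 }, { 3, -2 }, { 9, +1 } };

  for (unsigned ii = 0; ii < sizeof(tts)/sizeof(tts[0]); ii++)
    accum[tts[ii].py] += tts[ii].dy;           // scatter-add, atomic on the GPU

  for (int row = 0; row < TILE_HEIGHT; row++)  // later flushed, one TTP per row
    printf("ttp[%2d] = %d\n", row, accum[row]);

  return 0;
}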
-
-//
-// The implication here is that if our device configuration has a
-// rectangular 1:2 tile then we need a block size of at least 2
-// subblocks. The subblock size of course needs to match the length of
-// the smallest tile side.
-//
-
-static
-void
-skc_accum_flush(__local struct skc_subgroup_smem * const smem,
- __global skc_bp_elem_t * const bp_elems,
- skc_block_id_t const pb_id)
-{
- // load the ttp elements
- SKC_PREFIX_TTP_V const ttp_v = smem->accum.vN.ttp[get_sub_group_local_id()];
- skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane();
-
-#if ( SKC_TILE_RATIO == 1 )
-
- bp_elems[offset] = ttp_v;
-
-#elif ( SKC_TILE_RATIO == 2 )
-
- vstore2(ttp_v,offset,bp_elems);
-
-#else
-
-#error("tile ratio greater than 2 not supported")
-
-#endif
-}
-
-//
-//
-//
-
-static
-void
-skc_accum_reset(__local struct skc_subgroup_smem * const smem)
-{
- for (uint ii=0; ii<SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH / SKC_PREFIX_SUBGROUP_SIZE; ii++)
- smem->accum.zero.ttp[ii * SKC_PREFIX_SUBGROUP_SIZE + skc_subgroup_lane()] = ( 0 );
-}
-
-//
-// get next sk key
-//
-
-static
-skc_ttsk_s_t
-skc_ttsk_v_get_next(skc_ttsk_v_t * const sk_v,
- skc_uint * const sk_next,
- skc_int * const rkpk_rem)
-{
- // decrement count
- *rkpk_rem -= 1;
-
-#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
- //
- // SIMT with subgroup support is easy
- //
- // SIMT without subgroup support can always emulate with smem
- //
-#if 0
- //
- // BUG TICKLED BY FILTHY CODE -- Intel compiler doesn't properly
- // broadcast a uint2 cast to a long. It was probably bad to do this
- // anyway without a union wrapping the TTSK scalar type.
- //
- // Consider creating a union { ulong; uint2 } at a later date --
- // probably no need to ever do this unless it makes broadcast faster
- // which is unlikely since it will probably be implemented as 2
- // 32-bit broadcasts.
- //
- // Additionally, the TTRK and TTXK key bitfield sizes are probably
- // cast in stone and we aren't going to change them no matter what
- // architecture we're on.
- //
- skc_ttsk_s_t sk_s = sub_group_broadcast(SKC_AS(ulong)(*sk_v),(*sk_next)++);
-#else
- skc_ttsk_s_t sk_s;
-
- sk_s.lo = sub_group_broadcast(sk_v->lo,*sk_next);
- sk_s.hi = sub_group_broadcast(sk_v->hi,*sk_next);
- *sk_next += 1;
-#endif
-
-#else
- //
- // SIMD will always grab component .s0 and then rotate the vector
- //
- skc_ttsk_s_t sk_s = ( sk_v->s0 );
-
- skc_ttsk_v_rotate_down(sk_v);
-
-#endif
-
- return sk_s;
-}
-
-//
-//
-//
-
-static
-skc_raster_yx_s
-skc_ttsk_v_first(skc_ttsk_v_t * const sk_v, skc_uint const sk_next)
-{
-#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
- //
- // SIMT with subgroup support is easy
- //
- // SIMT without subgroup support can always emulate with smem
- //
- skc_raster_yx_s const yx_s = sub_group_broadcast(sk_v->hi,sk_next);
-
-#else
- //
- // SIMD will always grab component .s0 and then rotate the vector
- //
- skc_raster_yx_s const yx_s = ( sk_v->s0.hi );
-
-#endif
-
- return yx_s;
-}
-
-//
-// mask off ttsb id
-//
-
-static
-skc_block_id_s_t
-skc_ttsk_s_get_ttsb_id(skc_ttsk_s_t const * const sk_s)
-{
- return ( sk_s->lo & SKC_TTXK_LO_MASK_ID );
-}
-
-//
-// load tts_v as early as possible
-//
-
-static
-skc_tts_v_t
-skc_load_tts(__global skc_bp_elem_t * const bp_elems,
- skc_block_id_s_t const sb_id)
-{
- return ( bp_elems[sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()] );
-}
-
-//
-// massage ttrk keys into ttsk keys
-//
-
-static
-void
-skc_ttrk_to_ttsk(skc_ttsk_v_t * const sk_v)
-{
- sk_v->lo = sk_v->lo & SKC_TTXK_LO_MASK_ID; // clear high (N/A) bits
- sk_v->hi = sk_v->hi << SKC_TTRK_HI_BITS_COHORT; // shift cohort away -- zeroes low bits
-}
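A small host-side sketch of skc_ttrk_to_ttsk(), using masks and widths that mirror the TTRK (32-bit compare) and TTSK v2 tables above rather than the real SKC macros: the low word keeps only the 27-bit TTSB ID, and shifting the 8-bit cohort off the top of the high word leaves X in bits 8..19 and Y in bits 20..31.

#include <stdint.h>
#include <stdio.h>

#define TTXK_LO_MASK_ID     0x07FFFFFFu   // 27-bit block id
#define TTRK_HI_BITS_COHORT 8

int main(void)
{
  uint32_t lo = 42u | (0x1Fu << 27);             // ttsb id + junk N/A bits
  uint32_t hi = 7u | (9u << 12) | (3u << 24);    // x=7, y=9, cohort=3

  lo &= TTXK_LO_MASK_ID;          // clear the N/A bits
  hi <<= TTRK_HI_BITS_COHORT;     // drop the cohort, zero the low bits

  printf("id=%u x=%u y=%u\n", lo, (hi >> 8) & 0xFFF, (hi >> 20) & 0xFFF);
  return 0;
}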
-
-//
-// replenish ttsk keys
-//
-
-static
-void
-skc_ttsk_v_replenish(skc_ttsk_v_t * const sk_v,
- skc_uint * const sk_next,
- skc_uint * const rks_next,
- __global skc_ttrk_e_t const * const rks)
-{
- // if there are still keys available then return
- if (*sk_next < SKC_PREFIX_TTXK_V_SIZE)
- return;
-
- //
- // otherwise, replenish sk_v
- //
- // NOTE NOTE NOTE -- we are assuming rks[] extent size is always
- // divisible by TTXK_V_SIZE and therefore loading some keys from the
- // next raster is OK.
- //
- *sk_next = 0;
- *rks_next += SKC_PREFIX_SUBGROUP_SIZE;
- *sk_v = rks[*rks_next];
-
-#if 0
- printf("* %08X ( %3u, %3u )\n",
- sk_v->hi,
- (sk_v->hi >> 12) & 0xFFF,
- (sk_v->hi ) & 0xFFF);
-#endif
-
- skc_ttrk_to_ttsk(sk_v);
-
-#if 0
- printf("! %08X ( %3u, %3u )\n",
- sk_v->hi,
- (sk_v->hi >> 20) & 0xFFF,
- (sk_v->hi >> 8) & 0xFFF);
-#endif
-}
-
-//
-// replenish block ids
-//
-// note that you can't overrun the block id pool since it's a ring
-//
-
-static
-void
-skc_blocks_replenish(skc_uint * const blocks_next,
- skc_uint * const blocks_idx,
- skc_block_id_v_t * const blocks,
- skc_uint const bp_mask, // pow2 modulo mask for block pool ring
- __global skc_block_id_t const * const bp_ids)
-
-{
- *blocks_idx += SKC_PREFIX_BLOCK_ID_V_SIZE;
- *blocks = bp_ids[*blocks_idx & bp_mask];
- *blocks_next = 0;
-
-#if 0
- printf("replenish blocks: %u\n",*blocks);
-#endif
-}
-
-//
-//
-//
-
-static
-skc_block_id_t
-skc_blocks_get_next(skc_uint * const blocks_next,
- skc_uint * const blocks_idx,
- skc_block_id_v_t * const blocks,
- skc_uint const bp_mask, // pow2 modulo mask for block pool ring
- __global skc_block_id_t const * const bp_ids)
-{
- // replenish?
- if (*blocks_next == SKC_PREFIX_BLOCK_ID_V_SIZE)
- {
- skc_blocks_replenish(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);
- }
-
-#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
- //
- // SIMT
- //
- skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next);
-
-#else
- //
- // SIMD
- //
- skc_block_id_t id = blocks->s0;
-
- skc_shuffle_down_1(*blocks);
-
-#endif
-
- *blocks_next += 1;
-
- return id;
-}
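The pattern here is a power-of-two ring walked a vector at a time: ids are consumed one by one and a fresh vector is fetched whenever the current one is exhausted. A CPU sketch under assumed (illustrative) ring and vector sizes:

#include <stdint.h>
#include <stdio.h>

#define BP_SIZE   16u                      // pow2 ring of block ids
#define BP_MASK   (BP_SIZE - 1u)
#define ID_V_SIZE 4u                       // ids fetched per "replenish"

static uint32_t bp_ids[BP_SIZE];           // block id ring

static uint32_t blocks[ID_V_SIZE];         // in-flight vector of ids
static uint32_t blocks_next = ID_V_SIZE;   // force a replenish on first use
static uint32_t blocks_idx  = 0;

static uint32_t blocks_get_next(void)
{
  if (blocks_next == ID_V_SIZE)            // replenish?
    {
      for (uint32_t ii = 0; ii < ID_V_SIZE; ii++)
        blocks[ii] = bp_ids[(blocks_idx + ii) & BP_MASK];

      blocks_idx += ID_V_SIZE;
      blocks_next = 0;
    }
  return blocks[blocks_next++];
}

int main(void)
{
  for (uint32_t ii = 0; ii < BP_SIZE; ii++)
    bp_ids[ii] = 100 + ii;                 // pretend these were reclaimed blocks

  for (uint32_t ii = 0; ii < 10; ii++)
    printf("%u\n", blocks_get_next());

  return 0;
}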
-
-//
-// subblock allocator
-//
-
-#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 )
-
-static
-skc_block_id_t
-skc_subblocks_get_next_pb_id(skc_block_id_t * const subblocks,
- skc_uint * const blocks_next,
- skc_uint * const blocks_idx,
- skc_block_id_v_t * const blocks,
- skc_uint const bp_mask, // pow2 modulo mask for block pool ring
- __global skc_block_id_t const * const bp_ids)
-{
- if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
- {
- *subblocks = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);
- }
-
- skc_block_id_t const pb_id = *subblocks;
-
- *subblocks += SKC_TILE_RATIO; // note this is one or two subblocks
-
- return pb_id;
-}
-
-#endif
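A CPU sketch of the subblock carve-out: a fresh block id is pulled whenever the cursor crosses a block boundary, and each request advances by TILE_RATIO subblocks. The constants and the stand-in pool are illustrative, not the SKC configuration values.

#include <stdint.h>
#include <stdio.h>

#define SUBBLOCKS_PER_BLOCK      8u
#define SUBBLOCKS_PER_BLOCK_MASK (SUBBLOCKS_PER_BLOCK - 1u)
#define TILE_RATIO               2u   // 1:2 tile -> a TTPB uses 2 subblocks

static uint32_t next_block = 0;

static uint32_t blocks_get_next(void)           // stand-in for the pool ring
{
  return (next_block++) * SUBBLOCKS_PER_BLOCK;  // block ids on block boundaries
}

static uint32_t subblocks_get_next_pb_id(uint32_t * const subblocks)
{
  if ((*subblocks & SUBBLOCKS_PER_BLOCK_MASK) == 0)
    *subblocks = blocks_get_next();

  uint32_t const pb_id = *subblocks;

  *subblocks += TILE_RATIO;                     // one or two subblocks
  return pb_id;
}

int main(void)
{
  uint32_t subblocks = 0;

  for (int ii = 0; ii < 10; ii++)
    printf("%u\n", subblocks_get_next_pb_id(&subblocks));

  return 0;
}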
-
-//
-// append a ttsk key to the work-in-progress node
-//
-
-static
-void
-skc_node_v_append_sk(skc_ttsk_s_t const * const sk_s,
-
- skc_ttxk_v_t * const xk_v,
- skc_uint * const xk_v_next,
- skc_uint * const xk_v_idx,
- __global skc_bp_elem_t * const bp_elems,
-
- skc_int const rkpk_rem,
-
- skc_uint * const blocks_next,
- skc_uint * const blocks_idx,
- skc_block_id_v_t * const blocks,
- skc_uint const bp_mask,
- __global skc_block_id_t const * const bp_ids)
-{
- //
- // Append an sk key to the in-register xk_v vector
- //
- // If the work-in-progress node in gmem will only have room for one
- // more key then:
- //
- // - if this was the final SK then write out xk_v and exit
- //
- // - otherwise, acquire a block id, link it, write out xk_v,
- // prepare new node
- //
- // Note that this does *not* try to squeeze a final key into the
- // next node slot. This optimization isn't worth the added
- // down-pipeline complexity.
- //
-#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
- //
- // SIMT
- //
- if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK))
- {
- *xk_v = *sk_s;
- }
-
- *xk_v_next += 1;
-
- // are there more keys coming?
- if (rkpk_rem > 0)
- {
- // is the node almost full?
- if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1)
- {
- skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);
-
- if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1)
- {
- xk_v->lo = id;
- xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary
- }
-
- // store xk_v (uint2) to bp (uint)
- bp_elems[*xk_v_idx ] = xk_v->lo;
- bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;
-#if 0
- printf("S) %u : %08v2X\n",*xk_v_idx,*xk_v);
-#endif
- // reinitialize xk_v
- xk_v->lo = SKC_UINT_MAX;
- xk_v->hi = SKC_UINT_MAX;
-
- // update node elem idx
- *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
-
- // reset node count
- *xk_v_next = 0;
- }
- // is xk_v full?
- else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0)
- {
- // store xk_v to bp
- bp_elems[*xk_v_idx ] = xk_v->lo;
- bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;
-#if 0
- printf("s) %u : %08v2X\n",*xk_v_idx,*xk_v);
-#endif
- // reinitialize xk_v
- xk_v->lo = SKC_UINT_MAX;
- xk_v->hi = SKC_UINT_MAX;
-
- // increment node elem idx
- *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;
- }
- }
- else
- {
- bp_elems[*xk_v_idx ] = xk_v->lo;
- bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;
-#if 0
- printf("z) %u : %08v2X\n",*xk_v_idx,*xk_v);
-#endif
- while ((*xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2)
- {
- *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;
-
- bp_elems[*xk_v_idx] = SKC_UINT_MAX;
- bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX;
- }
- }
-
-#else
- //
- // SIMD
- //
-
-#endif
-}
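The stores above write each lane's uint2 key as two strided uint words: .lo at the element index and .hi one subgroup-width later, so a row of keys lands as SUBGROUP_SIZE low words followed by SUBGROUP_SIZE high words. A tiny CPU sketch of the resulting layout, with an assumed subgroup size of 8:

#include <stdint.h>
#include <stdio.h>

#define SUBGROUP_SIZE 8u

int main(void)
{
  uint32_t bp_elems[SUBGROUP_SIZE * 2];

  for (uint32_t lane = 0; lane < SUBGROUP_SIZE; lane++)
    {
      uint32_t const lo = 0xA0 + lane;             // per-lane xk_v.lo
      uint32_t const hi = 0xB0 + lane;             // per-lane xk_v.hi

      bp_elems[lane]                 = lo;         // bp_elems[*xk_v_idx]
      bp_elems[lane + SUBGROUP_SIZE] = hi;         // bp_elems[*xk_v_idx + SUBGROUP_SIZE]
    }

  for (uint32_t ii = 0; ii < SUBGROUP_SIZE * 2; ii++)
    printf("%02X ", bp_elems[ii]);

  printf("\n");
  return 0;
}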
-
-//
-//
-//
-
-static
-skc_ttpk_s_t
-skc_ttpk_create(skc_raster_yx_s const yx_prev,
- skc_raster_yx_s const yx_next,
- skc_block_id_t const pb_id)
-{
- // - yx_prev is already incremented by one
- // - yx_span is already shifted up at hi.x
- skc_uint const yx_span = yx_next - yx_prev;
-
- skc_ttpk_s_t pk;
-
- // turn on prefix bit | shift span bits upward
- pk.lo = pb_id | SKC_TTXK_LO_MASK_PREFIX | (yx_span << SKC_TTPK_LO_SHL_YX_SPAN);
-
- // shift down high span bits | yx of tile
- pk.hi = (yx_span >> SKC_TTPK_HI_SHR_YX_SPAN) | yx_prev;
-
-#if 0
- if (get_sub_group_local_id() == 0)
- printf("* %08v2X : %u\n",pk,yx_span);
-#endif
-
- return pk;
-}
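The following standalone sketch packs a TTPK with the 4/8 span split implied by the TTPK v2 table (the low 4 span bits sit in the low word above the prefix bit, the remaining 8 sit below X/Y in the high word) and reassembles the span, roughly what the downstream place kernel does before adding one to get the tile count. Offsets and masks are illustrative, not the SKC macros.

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

static void ttpk_pack(uint32_t id, uint32_t span, uint32_t x, uint32_t y,
                      uint32_t * lo, uint32_t * hi)
{
  *lo = id | (1u << 27) | ((span & 0xF) << 28);    // id | prefix=1 | span[3:0]
  *hi = (span >> 4) | (x << 8) | (y << 20);        // span[11:4] | x | y
}

static uint32_t ttpk_get_span(uint32_t lo, uint32_t hi)
{
  return (lo >> 28) | ((hi & 0xFF) << 4);          // reassemble the 12-bit span
}

int main(void)
{
  uint32_t lo,hi;
  ttpk_pack(12345, 300, 7, 9, &lo, &hi);
  assert(ttpk_get_span(lo,hi) == 300);
  printf("%08X %08X\n", hi, lo);
  return 0;
}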
-
-//
-// append a ttpk key to the work-in-progress node
-//
-
-static
-void
-skc_node_v_append_pk(skc_ttpk_s_t const * const pk_s,
-
- skc_ttxk_v_t * const xk_v,
- skc_uint * const xk_v_next,
- skc_uint * const xk_v_idx,
- __global skc_bp_elem_t * const bp_elems,
-
- skc_uint * const blocks_next,
- skc_uint * const blocks_idx,
- skc_block_id_v_t * const blocks,
- skc_uint const bp_mask,
- __global skc_block_id_t const * const bp_ids)
-{
- //
- // append a pk key to the in-register xk_v vector
- //
- // if the work-in-progress node in gmem will only have room for one
- // more key then:
- //
- // - if this was the final key then write out xk_v and exit
- //
- // - otherwise, acquire a block id, link it, write out xk_v,
- // prepare new node
- //
-#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
- //
- // SIMT
- //
- if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK))
- {
- *xk_v = *pk_s;
- }
-
- *xk_v_next += 1;
-
- // is the node almost full?
- if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1)
- {
- skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);
-
- if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1)
- {
- xk_v->lo = id;
- xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary
- }
-
- // store xk_v to bp
- bp_elems[*xk_v_idx ] = xk_v->lo;
- bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;
-#if 0
- printf("P) %u : %08v2X\n",*xk_v_idx,*xk_v);
-#endif
- // reinitialize xk_v
- xk_v->lo = SKC_UINT_MAX;
- xk_v->hi = SKC_UINT_MAX;
-
- // update node elem idx
- *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
-
- // reset node count
- *xk_v_next = 0;
- }
- // is xk_v full?
- else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0)
- {
- // store xk_v to bp
- bp_elems[*xk_v_idx ] = xk_v->lo;
- bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;
-#if 0
- printf("p) %u : %08v2X\n",*xk_v_idx,*xk_v);
-#endif
- // reinitialize xk_v
- xk_v->lo = SKC_UINT_MAX;
- xk_v->hi = SKC_UINT_MAX;
-
- // increment node elem idx
- *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;
- }
-
-#else
- //
- // SIMD
- //
-#endif
-}
-
-//
-// append the first 3 fields of meta info to the raster header
-//
-
-static
-void
-skc_node_v_init_header(skc_ttxk_v_t * const xk_v,
- skc_uint * const xk_v_next,
- union skc_raster_cohort_meta_out const * const meta)
-{
-#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
- //
- // SIMT
- //
- if (get_sub_group_local_id() < 2)
- {
- *xk_v = ((get_sub_group_local_id() & 1) == 0) ? meta->u32v4.lo : meta->u32v4.hi;
- }
-
-#if 0
- if (get_sub_group_local_id() == 0)
- printf("header: %08v4X\n",meta->u32v4);
-#endif
-
- //
- // increment counter: uint4 + uint4 = uint2 x 4
- //
- *xk_v_next = 2 + 2; // +2 for uninitialized bounds
-
-#else
- //
- // SIMD
- //
-
-#endif
-}
-
-//
-//
-//
-
-__kernel
-SKC_PREFIX_KERNEL_ATTRIBS
-void
-skc_kernel_prefix(__global skc_uint const * const bp_atomics,
- __global skc_block_id_t const * const bp_ids,
- __global skc_bp_elem_t * const bp_elems,
- skc_uint const bp_mask, // pow2 modulo mask for block pool ring
- __global skc_ttrk_e_t const * const rks,
- __global skc_block_id_t * const map,
- __global skc_uint const * const metas,
- skc_uint const count)
-{
- //
- // declare shared memory block
- //
-#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 )
- __local struct skc_subgroup_smem smem[1];
-#else
- __local struct skc_subgroup_smem smems[SKC_PREFIX_WORKGROUP_SUBGROUPS];
- __local struct skc_subgroup_smem * restrict const smem = smems + get_sub_group_id();
-#endif
-
- //
- // where is this subgroup in the grid?
- //
-#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 )
- skc_uint const sgi = get_group_id(0);
-#else
- skc_uint const sgi = get_group_id(0) * SKC_PREFIX_WORKGROUP_SUBGROUPS + get_sub_group_id();
-#endif
-
- skc_uint const sgl = get_sub_group_local_id();
-
- //
- // return if this subgroup is excess
- //
-#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS > 1 )
- if (sgi >= count)
- return;
-#endif
-
- //
- // get meta info for this subgroup's raster
- //
- union skc_raster_cohort_meta_out const meta = { vload4(sgi,metas) };
- skc_uint const reads = metas[SKC_RASTER_COHORT_META_OFFSET_READS + sgi];
-
-#if 0
- if (get_sub_group_local_id() == 0)
- printf("%3u : %5u / %5u / %5u / %5u / %u\n",
- sgi,
- meta.blocks,
- meta.offset,
- meta.nodes,
- meta.keys,
- reads);
-#endif
-
- //
- // preload blocks -- align on subgroup
- //
- skc_uint blocks_idx = (reads & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane();
- skc_block_id_v_t blocks = bp_ids[blocks_idx & bp_mask];
- skc_uint blocks_next = (reads & SKC_PREFIX_SUBGROUP_MASK);
-
- //
- // prime xk_v_idx with a block but note that OpenCL vstore_n() will scale the offset
- //
- skc_uint xk_v_idx = sub_group_broadcast(blocks,blocks_next++) * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
-
- //
- // initialize raster header -- assumes block is greater than 8 words (4 doublewords)
- //
- skc_ttxk_v_t xk_v = { SKC_UINT_MAX, SKC_UINT_MAX };
- skc_uint xk_v_next;
-
- skc_node_v_init_header(&xk_v,&xk_v_next,&meta);
-
- //
- // no keys -- this is an empty raster!
- //
- if (meta.keys == 0)
- {
- bp_elems[xk_v_idx ] = xk_v.lo;
- bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v.hi;
-
- while ((xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2)
- {
- xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;
-
- bp_elems[xk_v_idx] = SKC_UINT_MAX;
- bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX;
- }
-
- return;
- }
-
- //
- // load TTRK keys and in-place convert to TTSK keys
- //
- skc_uint rks_next = (meta.offset & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane();
- skc_ttsk_v_t sk_v = rks[rks_next];
- skc_uint sk_next = (meta.offset & SKC_PREFIX_SUBGROUP_MASK);
- skc_int rkpk_rem = meta.keys; // signed count of remaining rk+pk keys
-
-#if 0
- printf("* %08X ( %3u, %3u )\n",
- sk_v.hi,
- (sk_v.hi >> 12) & 0xFFF,
- (sk_v.hi ) & 0xFFF);
-#endif
-
- skc_ttrk_to_ttsk(&sk_v);
-
-#if 0
- printf("! %08X ( %3u, %3u )\n",
- sk_v.hi,
- (sk_v.hi >> 20) & 0xFFF,
- (sk_v.hi >> 8) & 0xFFF);
-#endif
-
- //
- // subblocks
- //
-#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 )
- skc_block_id_t subblocks = 0;
-#endif
-
- //
- // begin "scan" of tiles
- //
- skc_raster_yx_s yx_prev = skc_ttsk_v_first(&sk_v,sk_next);
-
- //
- // zero the accumulator
- //
- skc_accum_reset(smem);
-
- while (true)
- {
- // get next rk key
- skc_ttsk_s_t const sk_s = skc_ttsk_v_get_next(&sk_v,&sk_next,&rkpk_rem);
-
- // load ttsb id
- skc_block_id_s_t const sb_id = skc_ttsk_s_get_ttsb_id(&sk_s);
-
- // load tts_v transaction "in flight" as early as possible
- skc_tts_v_t const tts_v = skc_load_tts(bp_elems,sb_id);
-
-#if 0
- printf("{ %08X }\n",tts_v);
-#endif
-
-#if 0
- if (get_sub_group_local_id() == 0)
- printf("[ %d, %X ]\n",rkpk_rem,sb_id);
-#endif
-
-#if 0
- if (get_sub_group_local_id() == 0)
- printf("@ %08X ( %3u, %3u )\n",sk_s.hi,(sk_s.hi >> 20),(sk_s.hi >> 8) & 0xFFF);
-#endif
-
- //
- // FIXME -- SOME OF THESE COMPARISONS CAN BE PERFORMED AHEAD OF
- // TIME AND SIMD'IZED
- //
-
- // if yx's don't match then we're either issuing a ttpk or
- // resetting the accumulator
- if (sk_s.hi != yx_prev)
- {
- // if yx_next.y == yx_last.y then x changed
- if (((sk_s.hi ^ yx_prev) & SKC_TTXK_HI_MASK_Y) == 0)
- {
- //
- // if the tile is not square then its ratio is 1:2
- //
-#if SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2
- skc_block_id_t const pb_id = skc_subblocks_get_next_pb_id(&subblocks,
- &blocks_next,
- &blocks_idx,
- &blocks,
- bp_mask,
- bp_ids);
-#else
- skc_block_id_t const pb_id = skc_blocks_get_next(&blocks_next,
- &blocks_idx,
- &blocks,
- bp_mask,
- bp_ids);
-#endif
-
- // flush accumulated ttp vector to block/subblock at ttpb_id
- skc_accum_flush(smem,bp_elems,pb_id);
-
-#if 0
- if (get_sub_group_local_id() == 0)
- {
- printf("%8u : ( %4u, %4u ) -> ( %4u, %4u )\n",
- pb_id,
- (yx_prev >> SKC_TTXK_HI_OFFSET_Y),
- (yx_prev >> SKC_TTXK_HI_OFFSET_X) & 0xFFF,
- (sk_s.hi >> SKC_TTXK_HI_OFFSET_Y) & 0xFFF,
- (sk_s.hi >> SKC_TTXK_HI_OFFSET_X) & 0xFFF);
- }
-#endif
-
- //
- // FIXME -- A SIMD-WIDE BLOCK OF TTPK KEYS CAN BE CREATED IN ONE STEP
- //
- rkpk_rem -= 1;
-
- // create the pk
- skc_ttpk_s_t const pk_s = skc_ttpk_create(yx_prev+SKC_TTXK_HI_ONE_X,sk_s.hi,pb_id);
-
- // append pk key to xk buffer
- skc_node_v_append_pk(&pk_s,
-
- &xk_v,
- &xk_v_next,
- &xk_v_idx,
- bp_elems,
-
- &blocks_next,
- &blocks_idx,
- &blocks,
- bp_mask,
- bp_ids);
- }
- else if (rkpk_rem > 0) // we're starting a new tile row
- {
- skc_accum_reset(smem);
- }
- }
-
- //
- // append sk key to node_v
- //
- // if rkpk_rem is zero then return from kernel
- //
- skc_node_v_append_sk(&sk_s,
-
- &xk_v,
- &xk_v_next,
- &xk_v_idx,
- bp_elems,
-
- rkpk_rem,
-
- &blocks_next,
- &blocks_idx,
- &blocks,
- bp_mask,
- bp_ids);
-
- // we're done if no more sk keys
- if (rkpk_rem == 0)
- break;
-
- // move to new tile
- yx_prev = sk_s.hi;
-
- // scatter tts values into accumulator
- skc_accum_scatter(smem,tts_v);
-
- // replenish sk keys
- skc_ttsk_v_replenish(&sk_v,&sk_next,&rks_next,rks);
- }
-}
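Stripped of the accumulator, block allocation, and node plumbing, the control flow of the loop above reduces to a walk over the y-major-sorted keys: when x advances within the same tile row a TTPK is emitted whose coordinate starts one tile past the previous key, and when the row changes the accumulator is simply reset. A compact CPU model with illustrative key values and helpers:

#include <stdio.h>

struct sk { int x, y; };    // stand-in for a decoded TTSK tile coordinate

int main(void)
{
  struct sk const keys[] = { {1,0}, {4,0}, {5,0}, {2,1}, {7,1} };
  int const count = (int)(sizeof(keys) / sizeof(keys[0]));

  int yx_prev_x = keys[0].x;
  int yx_prev_y = keys[0].y;

  for (int ii = 0; ii < count; ii++)
    {
      if (keys[ii].y != yx_prev_y)
        {
          // new tile row: reset the accumulator
        }
      else if (keys[ii].x != yx_prev_x)
        {
          // same row, x advanced: flush the accumulator to a TTPB and emit a
          // TTPK whose stored span mirrors skc_ttpk_create(): yx_next - (yx_prev + 1)
          printf("TTPK x=%d y=%d span=%d\n",
                 yx_prev_x + 1, yx_prev_y, keys[ii].x - (yx_prev_x + 1));
        }

      printf("TTSK x=%d y=%d\n", keys[ii].x, keys[ii].y);   // append the sk key

      yx_prev_x = keys[ii].x;
      yx_prev_y = keys[ii].y;
    }

  return 0;
}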
-
-//
-//
-//
+/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "tile.h" +#include "block.h" +#include "raster.h" +#include "atomic_cl.h" +#include "raster_builder_cl_12.h" +#include "kernel_cl_12.h" + +// +// INPUT: +// +// TTRK (64-BIT COMPARE) +// +// 0 63 +// | TTSB ID | X | Y | COHORT ID | +// +---------+------+------+-----------+ +// | 27 | 12 | 12 | 13 | +// +// +// TTRK (32-BIT COMPARE) +// +// 0 63 +// | TTSB ID | N/A | X | Y | COHORT ID | +// +---------+-----+------+------+-----------+ +// | 27 | 5 | 12 | 12 | 8 | +// +// +// OUTPUT: +// +// TTSK v2: +// +// 0 63 +// | TTSB ID | PREFIX | N/A | X | Y | +// +---------+--------+------+----+----+ +// | 27 | 1 (=0) | 12 | 12 | 12 | +// +// +// TTPK v1: +// +// 0 63 +// | TTPB ID | ALL ZEROES | SPAN | X | Y | +// +---------+------------+------+-----+-----+ +// | 27 | 1 | 12 | 12 | 12 | +// +// +// TTPK v2: +// +// 0 63 +// | TTPB ID | PREFIX | SPAN | X | Y | +// +---------+--------+------+-----+-----+ +// | 27 | 1 (=1) | 12 | 12 | 12 | +// + +#define SKC_PREFIX_SUBGROUP_MASK (SKC_PREFIX_SUBGROUP_SIZE - 1) + +// +// smem accumulator +// + +union skc_subgroup_accum +{ + struct { + SKC_ATOMIC_INT ttp[SKC_TILE_HEIGHT]; + } atomic; + + struct { + skc_ttp_t ttp[SKC_TILE_HEIGHT]; + } aN; + + struct { + SKC_PREFIX_TTP_V ttp[SKC_PREFIX_SUBGROUP_SIZE]; + } vN; + + struct { + SKC_PREFIX_SMEM_ZERO ttp[SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH]; + } zero; +}; + +// +// +// + +struct skc_subgroup_smem +{ + // prefix accumulator + union skc_subgroup_accum accum; +}; + +// +// +// + +static +skc_uint +skc_subgroup_lane() +{ +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + return get_sub_group_local_id(); +#else + return 0; +#endif +} + +// +// +// + +static +SKC_PREFIX_TTS_V_BITFIELD +skc_tts_get_dy(skc_tts_v_t const ttsv) +{ + // tts.dy is packed to fit in range [-32,31] and unpacked to [-32..-1,+1..+32] + SKC_PREFIX_TTS_V_BITFIELD const dy = ttsv >> SKC_TTS_OFFSET_DY; + + return dy - (~ttsv >> 31); +} + +static +SKC_PREFIX_TTS_V_BITFIELD +skc_tts_get_py(skc_tts_v_t const ttsv) +{ + return SKC_BFE(ttsv,SKC_TTS_BITS_TY-SKC_SUBPIXEL_RESL_Y_LOG2,SKC_TTS_OFFSET_TY+SKC_SUBPIXEL_RESL_Y_LOG2); +} + +// +// +// + +static +void +skc_accum_scatter(__local struct skc_subgroup_smem * const smem, skc_tts_v_t const tts_v) +{ + // get "altitude" + SKC_PREFIX_TTS_V_BITFIELD dy = skc_tts_get_dy(tts_v); + + // get the y pixel coordinate + SKC_PREFIX_TTS_V_BITFIELD py = skc_tts_get_py(tts_v); + + // + // FIXME -- benchmark performance of setting dy to 0 if tts_v is invalid? + // + // FIXME -- consider making TTS_INVALID a dy/py/etc. 
that's a no-op + // + +#if 0 + if (tts_v != SKC_TTS_INVALID) + printf("< %08X = %u : %d >\n",tts_v,py,dy); +#endif + + // + // scatter-add the "altitude" to accumulator + // +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (tts_v C != SKC_TTS_INVALID) { \ + SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->accum.atomic.ttp + py C, dy C); \ + } + +#else + // + // CPU/SIMD -- ITERATE OVER VECTOR, NO NEED FOR ATOMICS + // + // WITH SIMD, ONCE A TTS_INVALID IS DETECTED WE CAN QUIT + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (tts_v C == SKC_TTS_INVALID) \ + return; \ + smem->accum.aN.ttp[py C] = dy C; +#endif + + SKC_PREFIX_TTS_VECTOR_INT_EXPAND(); +} + +// +// The implication here is that if our device configuration has a +// rectangular 1:2 tile then we need a block size of at least 2 +// subblocks. The subblock size of course needs to match the length of +// the smallest tile side. +// + +static +void +skc_accum_flush(__local struct skc_subgroup_smem * const smem, + __global skc_bp_elem_t * const bp_elems, + skc_block_id_t const pb_id) +{ + // load the ttp elements + SKC_PREFIX_TTP_V const ttp_v = smem->accum.vN.ttp[get_sub_group_local_id()]; + skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane(); + +#if ( SKC_TILE_RATIO == 1 ) + + bp_elems[offset] = ttp_v; + +#elif ( SKC_TILE_RATIO == 2 ) + + vstore2(ttp_v,offset,bp_elems); + +#else + +#error("tile ratio greater than 2 not supported") + +#endif +} + +// +// +// + +static +void +skc_accum_reset(__local struct skc_subgroup_smem * const smem) +{ + for (uint ii=0; ii<SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH / SKC_PREFIX_SUBGROUP_SIZE; ii++) + smem->accum.zero.ttp[ii * SKC_PREFIX_SUBGROUP_SIZE + skc_subgroup_lane()] = ( 0 ); +} + +// +// get next sk key +// + +static +skc_ttsk_s_t +skc_ttsk_v_get_next(skc_ttsk_v_t * const sk_v, + skc_uint * const sk_next, + skc_int * const rkpk_rem) +{ + // decrement count + *rkpk_rem -= 1; + +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // SIMT with subgroup support is easy + // + // SIMT without subgroup support can always emulate with smem + // +#if 0 + // + // BUG TICKLED BY FILTHY CODE -- Intel compiler doesn't properly + // broadcast a uint2 cast to a long. It was probably bad to do this + // anyway without a union wrapping the TTSK scalar type. + // + // Consider creating a union { ulong; uint2 } at a later date -- + // probably no need to ever do this unless it makes broadcast faster + // which is unlikely since it will probably be implemented as 2 + // 32-bit broadcasts. + // + // Additionally, the TTRK and TTXK key bitfield sizes are probably + // cast in stone and we aren't going to change them no matter + // architecture we're on. 
+ // + skc_ttsk_s_t sk_s = sub_group_broadcast(SKC_AS(ulong)(*sk_v),(*sk_next)++); +#else + skc_ttsk_s_t sk_s; + + sk_s.lo = sub_group_broadcast(sk_v->lo,*sk_next); + sk_s.hi = sub_group_broadcast(sk_v->hi,*sk_next); + *sk_next += 1; +#endif + +#else + // + // SIMD will always grab component .s0 and then rotate the vector + // + sk_s = ( sk_v->s0 ); + + skc_ttsk_v_rotate_down(sk_v); + +#endif + + return sk_s; +} + +// +// +// + +static +skc_raster_yx_s +skc_ttsk_v_first(skc_ttsk_v_t * const sk_v, skc_uint const sk_next) +{ +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // SIMT with subgroup support is easy + // + // SIMT without subgroup support can always emulate with smem + // + skc_raster_yx_s const yx_s = sub_group_broadcast(sk_v->hi,sk_next); + +#else + // + // SIMD will always grab component .s0 and then rotate the vector + // + skc_raster_yx_s const yx_s = ( sk_v->s0.hi ); + +#endif + + return yx_s; +} + +// +// mask off ttsb id +// + +static +skc_block_id_s_t +skc_ttsk_s_get_ttsb_id(skc_ttsk_s_t const * const sk_s) +{ + return ( sk_s->lo & SKC_TTXK_LO_MASK_ID ); +} + +// +// load tts_v as early as possible +// + +static +skc_tts_v_t +skc_load_tts(__global skc_bp_elem_t * const bp_elems, + skc_block_id_s_t const sb_id) +{ + return ( bp_elems[sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()] ); +} + +// +// massage ttrk keys into ttsk keys +// + +static +void +skc_ttrk_to_ttsk(skc_ttsk_v_t * const sk_v) +{ + sk_v->lo = sk_v->lo & SKC_TTXK_LO_MASK_ID; // clear high (N/A) bits + sk_v->hi = sk_v->hi << SKC_TTRK_HI_BITS_COHORT; // shift cohort away -- zeroes low bits +} + +// +// replenish ttsk keys +// + +static +void +skc_ttsk_v_replenish(skc_ttsk_v_t * const sk_v, + skc_uint * const sk_next, + skc_uint * const rks_next, + __global skc_ttrk_e_t const * const rks) +{ + // if there are still keys available then return + if (*sk_next < SKC_PREFIX_TTXK_V_SIZE) + return; + + // + // otherwise, replenish sk_v + // + // NOTE NOTE NOTE -- we are assuming rks[] extent size is always + // divisible by TTXK_V_SIZE and therefore loading some keys from the + // next raster is OK. + // + *sk_next = 0; + *rks_next += SKC_PREFIX_SUBGROUP_SIZE; + *sk_v = rks[*rks_next]; + +#if 0 + printf("* %08X ( %3u, %3u )\n", + sk_v->hi, + (sk_v->hi >> 12) & 0xFFF, + (sk_v->hi ) & 0xFFF); +#endif + + skc_ttrk_to_ttsk(sk_v); + +#if 0 + printf("! %08X ( %3u, %3u )\n", + sk_v->hi, + (sk_v->hi >> 20) & 0xFFF, + (sk_v->hi >> 8) & 0xFFF); +#endif +} + +// +// replenish block ids +// +// note that you can't overrun the block id pool since it's a ring +// + +static +void +skc_blocks_replenish(skc_uint * const blocks_next, + skc_uint * const blocks_idx, + skc_block_id_v_t * const blocks, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids) + +{ + *blocks_idx += SKC_PREFIX_BLOCK_ID_V_SIZE; + *blocks = bp_ids[*blocks_idx & bp_mask]; + *blocks_next = 0; + +#if 0 + printf("replenish blocks: %u\n",*blocks); +#endif +} + +// +// +// + +static +skc_block_id_t +skc_blocks_get_next(skc_uint * const blocks_next, + skc_uint * const blocks_idx, + skc_block_id_v_t * const blocks, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids) +{ + // replenish? 
+ if (*blocks_next == SKC_PREFIX_BLOCK_ID_V_SIZE) + { + skc_blocks_replenish(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); + } + +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // SIMT + // + skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next); + +#else + // + // SIMD + // + skc_block_id_t id = blocks->s0; + + skc_shuffle_down_1(*blocks); + +#endif + + *blocks_next += 1; + + return id; +} + +// +// subblock allocator +// + +#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 ) + +static +skc_block_id_t +skc_subblocks_get_next_pb_id(skc_block_id_t * const subblocks, + skc_uint * const blocks_next, + skc_uint * const blocks_idx, + skc_block_id_v_t * const blocks, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids) +{ + if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) + { + *subblocks = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); + } + + skc_block_id_t const pb_id = *subblocks; + + *subblocks += SKC_TILE_RATIO; // note this is one or two subblocks + + return pb_id; +} + +#endif + +// +// append a ttsk key to the work-in-progress node +// + +static +void +skc_node_v_append_sk(skc_ttsk_s_t const * const sk_s, + + skc_ttxk_v_t * const xk_v, + skc_uint * const xk_v_next, + skc_uint * const xk_v_idx, + __global skc_bp_elem_t * const bp_elems, + + skc_int const rkpk_rem, + + skc_uint * const blocks_next, + skc_uint * const blocks_idx, + skc_block_id_v_t * const blocks, + skc_uint const bp_mask, + __global skc_block_id_t const * const bp_ids) +{ + // + // Append an sk key to the in-register xk_v vector + // + // If the work-in-progress node in gmem will only have room for one + // more key then: + // + // - if this was the final SK then write out xk_v and exit + // + // - otherwise, acquire a block id, link it, write out xk_v, + // prepare new node + // + // Note that this does *not* try to squeeze in a final key into the + // next node slot. This optimization isn't worth the added + // down-pipeline complexity. + // +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // SIMT + // + if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK)) + { + *xk_v = *sk_s; + } + + *xk_v_next += 1; + + // are there more keys coming? + if (rkpk_rem > 0) + { + // is the node almost full? + if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1) + { + skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); + + if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1) + { + xk_v->lo = id; + xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary + } + + // store xk_v (uint2) to bp (uint) + bp_elems[*xk_v_idx ] = xk_v->lo; + bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; +#if 0 + printf("S) %u : %08v2X\n",*xk_v_idx,*xk_v); +#endif + // reinitialize xk_v + xk_v->lo = SKC_UINT_MAX; + xk_v->hi = SKC_UINT_MAX; + + // update node elem idx + *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); + + // reset node count + *xk_v_next = 0; + } + // is xk_v full? 
+ else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0) + { + // store xk_v to bp + bp_elems[*xk_v_idx ] = xk_v->lo; + bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; +#if 0 + printf("s) %u : %08v2X\n",*xk_v_idx,*xk_v); +#endif + // reinitialize xk_v + xk_v->lo = SKC_UINT_MAX; + xk_v->hi = SKC_UINT_MAX; + + // increment node elem idx + *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; + } + } + else + { + bp_elems[*xk_v_idx ] = xk_v->lo; + bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; +#if 0 + printf("z) %u : %08v2X\n",*xk_v_idx,*xk_v); +#endif + while ((*xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2) + { + *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; + + bp_elems[*xk_v_idx] = SKC_UINT_MAX; + bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX; + } + } + +#else + // + // SIMD + // + +#endif +} + +// +// +// + +static +skc_ttpk_s_t +skc_ttpk_create(skc_raster_yx_s const yx_prev, + skc_raster_yx_s const yx_next, + skc_block_id_t const pb_id) +{ + // - yx_prev is already incremented by one + // - yx_span is already shifted up at hi.x + skc_uint const yx_span = yx_next - yx_prev; + + skc_ttpk_s_t pk; + + // turn on prefix bit | shift span bits upward + pk.lo = pb_id | SKC_TTXK_LO_MASK_PREFIX | (yx_span << SKC_TTPK_LO_SHL_YX_SPAN); + + // shift down high span bits | yx of tile + pk.hi = (yx_span >> SKC_TTPK_HI_SHR_YX_SPAN) | yx_prev; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("* %08v2X : %u\n",pk,yx_span); +#endif + + return pk; +} + +// +// append a ttpk key to the work-in-progress node +// + +static +void +skc_node_v_append_pk(skc_ttpk_s_t const * const pk_s, + + skc_ttxk_v_t * const xk_v, + skc_uint * const xk_v_next, + skc_uint * const xk_v_idx, + __global skc_bp_elem_t * const bp_elems, + + skc_uint * const blocks_next, + skc_uint * const blocks_idx, + skc_block_id_v_t * const blocks, + skc_uint const bp_mask, + __global skc_block_id_t const * const bp_ids) +{ + // + // append a pk key to the in-register xk_v vector + // + // if the work-in-progress node in gmem will only have room for one + // more key then: + // + // - if this was the final SK then write out xk_v and exit + // + // - otherwise, acquire a block id, link it, write out xk_v, + // prepare new node + // +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // SIMT + // + if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK)) + { + *xk_v = *pk_s; + } + + *xk_v_next += 1; + + // is the node almost full? + if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1) + { + skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids); + + if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1) + { + xk_v->lo = id; + xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary + } + + // store xk_v to bp + bp_elems[*xk_v_idx ] = xk_v->lo; + bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; +#if 0 + printf("P) %u : %08v2X\n",*xk_v_idx,*xk_v); +#endif + // reinitialize xk_v + xk_v->lo = SKC_UINT_MAX; + xk_v->hi = SKC_UINT_MAX; + + // update node elem idx + *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); + + // reset node count + *xk_v_next = 0; + } + // is xk_v full? 
+ else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0) + { + // store xk_v to bp + bp_elems[*xk_v_idx ] = xk_v->lo; + bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi; +#if 0 + printf("p) %u : %08v2X\n",*xk_v_idx,*xk_v); +#endif + // reinitialize xk_v + xk_v->lo = SKC_UINT_MAX; + xk_v->hi = SKC_UINT_MAX; + + // increment node elem idx + *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; + } + +#else + // + // SIMD + // +#endif +} + +// +// append the first 3 fields of meta info to the raster header +// + +static +void +skc_node_v_init_header(skc_ttxk_v_t * const xk_v, + skc_uint * const xk_v_next, + union skc_raster_cohort_meta_out const * const meta) +{ +#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 ) + // + // SIMT + // + if (get_sub_group_local_id() < 2) + { + *xk_v = ((get_sub_group_local_id() & 1) == 0) ? meta->u32v4.lo : meta->u32v4.hi; + } + +#if 0 + if (get_sub_group_local_id() == 0) + printf("header: %08v4X\n",meta->u32v4); +#endif + + // + // increment counter: uint4 + uint4 = uint2 x 4 + // + *xk_v_next = 2 + 2; // +2 for unitialized bounds + +#else + // + // SIMD + // + +#endif +} + +// +// +// + +__kernel +SKC_PREFIX_KERNEL_ATTRIBS +void +skc_kernel_prefix(__global skc_uint const * const bp_atomics, + __global skc_block_id_t const * const bp_ids, + __global skc_bp_elem_t * const bp_elems, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_ttrk_e_t const * const rks, + __global skc_block_id_t * const map, + __global skc_uint const * const metas, + skc_uint const count) +{ + // + // declare shared memory block + // +#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 ) + __local struct skc_subgroup_smem smem[1]; +#else + __local struct skc_subgroup_smem smems[SKC_PREFIX_WORKGROUP_SUBGROUPS]; + __local struct skc_subgroup_smem * restrict const smem = smems + get_sub_group_id(); +#endif + + // + // where is this subgroup in the grid? + // +#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 ) + skc_uint const sgi = get_group_id(0); +#else + skc_uint const sgi = get_group_id(0) * SKC_PREFIX_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + + skc_uint const sgl = get_sub_group_local_id(); + + // + // return if this subgroup is excess + // +#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS > 1 ) + if (sgi >= count) + return; +#endif + + // + // get meta info for this subgroup's raster + // + union skc_raster_cohort_meta_out const meta = { vload4(sgi,metas) }; + skc_uint const reads = metas[SKC_RASTER_COHORT_META_OFFSET_READS + sgi]; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("%3u : %5u / %5u / %5u / %5u / %u\n", + sgi, + meta.blocks, + meta.offset, + meta.nodes, + meta.keys, + reads); +#endif + + // + // preload blocks -- align on subgroup + // + skc_uint blocks_idx = (reads & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane(); + skc_block_id_v_t blocks = bp_ids[blocks_idx & bp_mask]; + skc_uint blocks_next = (reads & SKC_PREFIX_SUBGROUP_MASK); + + // + // prime xk_v_idx with a block but note that OpenCL vstore_n() will scale the offset + // + skc_uint xk_v_idx = sub_group_broadcast(blocks,blocks_next++) * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); + + // + // initialize raster header -- assumes block is greater than 8 words (4 doublewords) + // + skc_ttxk_v_t xk_v = { SKC_UINT_MAX, SKC_UINT_MAX }; + skc_uint xk_v_next; + + skc_node_v_init_header(&xk_v,&xk_v_next,&meta); + + // + // no keys -- this is an empty raster! 
+ // + if (meta.keys == 0) + { + bp_elems[xk_v_idx ] = xk_v.lo; + bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v.hi; + + while ((xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2) + { + xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2; + + bp_elems[xk_v_idx] = SKC_UINT_MAX; + bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX; + } + + return; + } + + // + // load TTRK keys and in-place convert to TTSK keys + // + skc_uint rks_next = (meta.offset & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane(); + skc_ttsk_v_t sk_v = rks[rks_next]; + skc_uint sk_next = (meta.offset & SKC_PREFIX_SUBGROUP_MASK); + skc_int rkpk_rem = meta.keys; // signed count of remaining rk+pk keys + +#if 0 + printf("* %08X ( %3u, %3u )\n", + sk_v.hi, + (sk_v.hi >> 12) & 0xFFF, + (sk_v.hi ) & 0xFFF); +#endif + + skc_ttrk_to_ttsk(&sk_v); + +#if 0 + printf("! %08X ( %3u, %3u )\n", + sk_v.hi, + (sk_v.hi >> 20) & 0xFFF, + (sk_v.hi >> 8) & 0xFFF); +#endif + + // + // subblocks + // +#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 ) + skc_block_id_t subblocks = 0; +#endif + + // + // begin "scan" of tiles + // + skc_raster_yx_s yx_prev = skc_ttsk_v_first(&sk_v,sk_next); + + // + // zero the accumulator + // + skc_accum_reset(smem); + + while (true) + { + // get next rk key + skc_ttsk_s_t const sk_s = skc_ttsk_v_get_next(&sk_v,&sk_next,&rkpk_rem); + + // load ttsb id + skc_block_id_s_t const sb_id = skc_ttsk_s_get_ttsb_id(&sk_s); + + // load tts_v transaction "in flight" as early as possible + skc_tts_v_t const tts_v = skc_load_tts(bp_elems,sb_id); + +#if 0 + printf("{ %08X }\n",tts_v); +#endif + +#if 0 + if (get_sub_group_local_id() == 0) + printf("[ %d, %X ]\n",rkpk_rem,sb_id); +#endif + +#if 0 + if (get_sub_group_local_id() == 0) + printf("@ %08X ( %3u, %3u )\n",sk_s.hi,(sk_s.hi >> 20),(sk_s.hi >> 8) & 0xFFF); +#endif + + // + // FIXME -- SOME OF THESE COMPARISONS CAN BE PERFORMED AHEAD OF + // TIME AND SIMD'IZED + // + + // if yx's don't match then we're either issuing a ttpk or + // resetting the accumulator + if (sk_s.hi != yx_prev) + { + // if yx_next.y == yx_last.y then x changed + if (((sk_s.hi ^ yx_prev) & SKC_TTXK_HI_MASK_Y) == 0) + { + // + // if the tile is not square then it's ratio is 1:2 + // +#if SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 + skc_block_id_t const pb_id = skc_subblocks_get_next_pb_id(&subblocks, + &blocks_next, + &blocks_idx, + &blocks, + bp_mask, + bp_ids); +#else + skc_block_id_t const pb_id = skc_blocks_get_next(&blocks_next, + &blocks_idx, + &blocks, + bp_mask, + bp_ids); +#endif + + // flush accumulated ttp vector to block/subblock at ttpb_id + skc_accum_flush(smem,bp_elems,pb_id); + +#if 0 + if (get_sub_group_local_id() == 0) + { + printf("%8u : ( %4u, %4u ) -> ( %4u, %4u )\n", + pb_id, + (yx_prev >> SKC_TTXK_HI_OFFSET_Y), + (yx_prev >> SKC_TTXK_HI_OFFSET_X) & 0xFFF, + (sk_s.hi >> SKC_TTXK_HI_OFFSET_Y) & 0xFFF, + (sk_s.hi >> SKC_TTXK_HI_OFFSET_X) & 0xFFF); + } +#endif + + // + // FIXME -- A SIMD-WIDE BLOCK OF TTPK KEYS CAN BE CREATED IN ONE STEP + // + rkpk_rem -= 1; + + // create the pk + skc_ttpk_s_t const pk_s = skc_ttpk_create(yx_prev+SKC_TTXK_HI_ONE_X,sk_s.hi,pb_id); + + // append pk key to xk buffer + skc_node_v_append_pk(&pk_s, + + &xk_v, + &xk_v_next, + &xk_v_idx, + bp_elems, + + &blocks_next, + &blocks_idx, + &blocks, + bp_mask, + bp_ids); + } + else if (rkpk_rem > 0) // we're starting a new tile row + { + skc_accum_reset(smem); + } + } + + // + // append sk key to node_v + // + // if rkpk_rem is 
zero then return from kernel + // + skc_node_v_append_sk(&sk_s, + + &xk_v, + &xk_v_next, + &xk_v_idx, + bp_elems, + + rkpk_rem, + + &blocks_next, + &blocks_idx, + &blocks, + bp_mask, + bp_ids); + + // we're done if no more sk keys + if (rkpk_rem == 0) + break; + + // move to new tile + yx_prev = sk_s.hi; + + // scatter tts values into accumulator + skc_accum_scatter(smem,tts_v); + + // replenish sk keys + skc_ttsk_v_replenish(&sk_v,&sk_next,&rks_next,rks); + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/rasterize.cl b/src/compute/skc/platforms/cl_12/kernels/rasterize.cl index e622845d9c..f20f6456b3 100644 --- a/src/compute/skc/platforms/cl_12/kernels/rasterize.cl +++ b/src/compute/skc/platforms/cl_12/kernels/rasterize.cl @@ -1,3366 +1,3366 @@ -/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "tile.h"
-#include "common.h"
-#include "atomic_cl.h"
-#include "block_pool_cl.h"
-#include "raster_builder_cl_12.h"
-#include "device_cl_12.h"
-
-// #define SKC_ARCH_AVX2
-// #define SKC_RASTERIZE_SIMD_USES_SMEM
-
-#define PRINTF_ENABLE 0
-#define PRINTF_BLOCK_COUNT 0
-
-//
-// NOTE:
-//
-// ON SIMD DEVICES THE BIN COUNT MUST BE POW2 SO THAT WE CAN LOAD IT
-// AS A VECTOR AND PERFORM A SWIZZLE/SHUFFLE
-//
-// NOTE:
-//
-// IGNORE FOR NOW ANY AVX2 CODE SNIPPETS. THEY WILL BE MOVED ASAP.
-//
-//
-
-#if 0 // SKC_ARCH_AVX2
-
-// #define SKC_RASTERIZE_SUBGROUP_SIZE 1
-// #define SKC_RASTERIZE_VECTOR_SIZE_LOG2 3
-// #define SKC_RASTERIZE_WORKGROUP_COUNT_SUBGROUP 1
-
-// #define SKC_TTXB_WORDS 8
-
-// #define SKC_RASTERIZE_FLOAT float8
-// #define SKC_RASTERIZE_UINT uint8
-// #define SKC_RASTERIZE_INT int8
-// #define SKC_RASTERIZE_PREDICATE int8
-
-// #define SKC_RASTERIZE_BIN_BLOCK uint16
-// #define SKC_RASTERIZE_BIN uint8
-
-// #define SKC_RASTERIZE_POOL uint8
-// #define SKC_RASTERIZE_POOL_SCALE 6
-
-// #define SKC_RASTERIZE_TILE_HASH_X_BITS 1
-// #define SKC_RASTERIZE_TILE_HASH_Y_BITS 2
-
-// #define SKC_RASTERIZE_VECTOR_EXPAND() SKC_EXPAND_8()
-
-#endif
-
-//
-// SIMT
-//
-
-#define SKC_RASTERIZE_BLOCK_ID_V_SIZE SKC_RASTERIZE_SUBGROUP_SIZE
-#define SKC_RASTERIZE_TTSK_V_SIZE SKC_RASTERIZE_SUBGROUP_SIZE
-#define SKC_RASTERIZE_TTSK_V_MASK (SKC_RASTERIZE_TTSK_V_SIZE - 1)
-
-//
-//
-//
-
-#define SKC_RASTERIZE_VECTOR_SIZE (1 << SKC_RASTERIZE_VECTOR_SIZE_LOG2)
-#define SKC_RASTERIZE_ELEMS_PER_SUBGROUP (SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_VECTOR_SIZE)
-
-//
-//
-//
-
-#define SKC_RASTERIZE_YX_INIT 0x7FFF7FFF // { +32767, +32767 }
-#define SKC_RASTERIZE_YX_INVALID 0x80008000 // { -32768, -32768 }
-
-//
-//
-//
-
-#define SKC_RASTERIZE_TILE_HASH_X_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_X_BITS)
-#define SKC_RASTERIZE_TILE_HASH_Y_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_Y_BITS)
-#define SKC_RASTERIZE_TILE_HASH_BITS (SKC_RASTERIZE_TILE_HASH_X_BITS + SKC_RASTERIZE_TILE_HASH_Y_BITS)
-#define SKC_RASTERIZE_TILE_HASH_BIN_COUNT (1 << SKC_RASTERIZE_TILE_HASH_BITS)
-#define SKC_RASTERIZE_TILE_HASH_BIN_BITS (SKC_RASTERIZE_TILE_HASH_BITS + 1) // FIXME -- LOG2_RU(BIN_COUNT)
-#define SKC_RASTERIZE_TILE_HASH_BIN_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_BIN_BITS)
-
-//
-// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"
-//
-// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/
-//
-// Lerp in two fma/mad ops:
-//
-// t * b + ((-t) * a + a)
-//
-// Note: OpenCL documents mix() as being implemented as:
-//
-// a + (b - a) * t
-//
-// But this may be a native instruction on some devices. For example,
-// on GEN9 there is an LRP "linear interpolation" opcode but it
-// doesn't appear to support half floats.
-//
-// Feel free to toggle this option and then benchmark and inspect the
-// generated code. We really want the double FMA to be generated when
-// there isn't support for a LERP/MIX operation.
-//
-
-#if 1
-#define SKC_LERP(a,b,t) mad(t,b,mad(-(t),a,a))
-#else
-#define SKC_LERP(a,b,t) mix(a,b,t)
-#endif
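//
// A scalar sketch (not from the original source) of the two-FMA lerp
// above -- skc_lerp_fma_sketch is a hypothetical helper name and
// fma() is the standard OpenCL built-in:
//
static float skc_lerp_fma_sketch(float const a, float const b, float const t)
{
  // t * b + ((-t) * a + a)  ==  a + (b - a) * t
  return fma(t,b,fma(-t,a,a));
}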
-
-//
-// There is no integer MAD in OpenCL with "don't care" overflow
-// semantics.
-//
-// FIXME -- verify if the platform needs explicit MAD operations even
-// if a "--fastmath" option is available at compile time. It might
-// make sense to explicitly use MAD calls if the platform requires it.
-//
-
-#if 1
-#define SKC_MAD_UINT(a,b,c) ((a) * (b) + (c))
-#else
-#define SKC_MAD_UINT(a,b,c) mad_sat(a,b,c)
-#endif
-
-//
-//
-//
-
-#define SKC_RASTERIZE_SEGMENT(id) (id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane())
-
-//
-//
-//
-
-union skc_bp_elem
-{
- skc_uint u32;
- skc_tagged_block_id_t tag_id;
- skc_float coord;
-};
-
-//
-//
-//
-
-struct skc_subgroup_smem
-{
- //
- // SIMT subgroup scratchpad for max scan -- also shared with 'winner' member
- //
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) || defined ( SKC_RASTERIZE_SIMD_USES_SMEM )
- struct {
- union {
-
- skc_uint winner;
-
- struct {
- skc_uint scratch[SKC_RASTERIZE_SUBGROUP_SIZE];
- } aN;
-
- struct {
- SKC_RASTERIZE_UINT scratch[SKC_RASTERIZE_SUBGROUP_SIZE];
- } vN;
- };
- } subgroup;
-#endif
-
- //
- // work-in-progress TTSB blocks and associated YX keys
- //
- union {
- struct {
- // FIXME -- some typedefs are valid here
- skc_uint ttsb [SKC_RASTERIZE_TILE_HASH_BIN_COUNT][SKC_DEVICE_SUBBLOCK_WORDS];
- skc_uint yx [SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
- skc_uint id [SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
- skc_uint count[SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
- } aN;
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
- struct {
- SKC_RASTERIZE_BIN_BLOCK ttsb[SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
- SKC_RASTERIZE_BIN yx;
- SKC_RASTERIZE_BIN id;
- SKC_RASTERIZE_BIN count;
- } vN;
-#endif
- } bin;
-};
-
-//
-//
-//
-
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-#define skc_subgroup_lane() 0
-#else
-#define skc_subgroup_lane() get_sub_group_local_id()
-#endif
-
-//
-// replenish block ids
-//
-// note that you can't overrun the block id pool since it's a ring
-//
-
-static
-void
-skc_blocks_replenish(skc_uint * const blocks_next,
- skc_block_id_v_t * const blocks,
- __global SKC_ATOMIC_UINT volatile * const bp_atomics,
- skc_uint const bp_mask, // pow2 modulo mask for block pool ring
- __global skc_block_id_t const * const bp_ids)
-{
- //
- // get a new vector of block ids -- this is kind of a narrow
- // allocation but subblocks help stretch out the pool.
- //
- // FIXME -- there is now plenty of SMEM to allocate a LOT of block ids
- //
- skc_uint bp_idx = 0;
-
- if (skc_subgroup_lane() == 0)
- {
- bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,
- SKC_RASTERIZE_BLOCK_ID_V_SIZE); // ring_reads
-#if 0
- printf("r+: %8u + %u\n",bp_idx,SKC_RASTERIZE_BLOCK_ID_V_SIZE);
-#endif
- }
-
- bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane()) & bp_mask;
- *blocks = bp_ids[bp_idx];
- *blocks_next = 0;
-}
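//
// A scalar sketch (not from the original source) of the ring indexing
// used above -- 'reads' stands in for the value returned by the
// atomic add and bp_mask is assumed to be pool_size - 1 for a pow2
// pool:
//
static skc_uint skc_ring_slot_sketch(skc_uint const reads, skc_uint const lane, skc_uint const bp_mask)
{
  // the read cursor grows without bound but always lands on a valid slot
  return (reads + lane) & bp_mask;
}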
-
-//
-//
-//
-
-static
-skc_block_id_t
-skc_blocks_get_next(skc_uint * const blocks_next,
- skc_block_id_v_t * const blocks,
- __global SKC_ATOMIC_UINT volatile * const bp_atomics,
- skc_uint const bp_mask, // pow2 modulo mask for block pool ring
- __global skc_block_id_t const * const bp_ids)
-{
- // replenish?
- if (*blocks_next == SKC_RASTERIZE_BLOCK_ID_V_SIZE)
- {
- skc_blocks_replenish(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);
- }
-
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 )
- //
- // SIMT
- //
- skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next);
-
-#else
- //
- // SIMD
- //
- skc_block_id_t id = blocks->s0;
-
- skc_shuffle_down_1(*blocks);
-
-#endif
-
- *blocks_next += 1;
-
- return id;
-}
-
-//
-// subblock allocator
-//
-
-#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
-
-static
-skc_block_id_t
-skc_subblocks_get_next(skc_block_id_t * const subblocks,
- skc_uint * const blocks_next,
- skc_block_id_v_t * const blocks,
- __global SKC_ATOMIC_UINT volatile * const bp_atomics,
- skc_uint const bp_mask, // pow2 modulo mask for block pool ring
- __global skc_block_id_t const * const bp_ids)
-{
- if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
- {
- *subblocks = skc_blocks_get_next(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);
- }
-
- skc_block_id_t const sb_id = *subblocks;
-
- *subblocks += 1;
-
-#if 0
- if (get_sub_group_local_id() == 0)
- printf("= %u\n",sb_id);
-#endif
-
- return sb_id;
-}
-
-
-#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const subblocks, skc_block_id_t * const blocks
-#define SKC_SUBBLOCKS_BLOCKS_ARGS() subblocks, blocks
-
-#else
-
-#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const blocks
-#define SKC_SUBBLOCKS_BLOCKS_ARGS() blocks
-
-#endif
-
-//
-//
-//
-
-static
-skc_block_id_t
-skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_PROTO(),
- skc_uint * const blocks_next,
- __global SKC_ATOMIC_UINT volatile * const bp_atomics,
- skc_uint const bp_mask, // pow2 modulo mask for block pool ring
- __global skc_block_id_t const * const bp_ids,
- __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
- skc_ttsk_v_t * const sk_v,
- skc_uint * const sk_v_next,
- __global skc_ttsk_s_t * const sk_extent,
- skc_uint const new_yx)
-{
-#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
- skc_block_id_t const new_id = skc_subblocks_get_next(subblocks,
- blocks_next,
- blocks,
- bp_atomics,
- bp_mask,
- bp_ids);
-#else
- skc_block_id_t const new_id = skc_blocks_get_next(blocks_next,
- blocks,
- bp_atomics,
- bp_mask, // pow2 modulo mask for block pool ring
- bp_ids);
-#endif
-
- if (get_sub_group_local_id() == (*sk_v_next & SKC_RASTERIZE_TTSK_V_MASK))
- {
- sk_v->lo = new_id;
- sk_v->hi = (sk_v->hi & SKC_TTRK_HI_MASK_COHORT) | new_yx;
-#if 0
- printf("@ ( %3u, %3u ) %u\n",
- (new_yx >> 12) & 0xFFF,
- (new_yx ) & 0xFFF,
- new_id);
-#endif
- }
-
- *sk_v_next += 1;
-
- if (*sk_v_next == SKC_RASTERIZE_TTSK_V_SIZE)
- {
- *sk_v_next = 0;
-
- skc_uint sk_idx = 0;
-
- if (skc_subgroup_lane() == 0)
- {
- sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
- (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_TTSK_V_SIZE);
-#if 0
- printf("+ %u\n",sk_idx);
-#endif
- }
-
- sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();
-
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE > SKC_RASTERIZE_TTSK_V_SIZE )
- if (skc_subgroup_lane() < SKC_RASTERIZE_TTSK_V_SIZE)
-#endif
- {
- sk_extent[sk_idx] = *sk_v;
-#if 0
- printf("> %u : %v2u\n",sk_idx,*sk_v);
-#endif
- }
- }
-
- return new_id;
-}
-
-//
-//
-//
-
-static
-SKC_RASTERIZE_FLOAT
-skc_subgroup_scan_inclusive_add_float(SKC_RASTERIZE_FLOAT const v)
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
- //
- // SIMD
- //
- // Note that there isn't a built-in horizontal scan for vectors so
- // we'll define some here for various widths.
- //
- // FIXME -- a scalar version might be faster so put in a
-  // compile-time switch to select between implementations
- //
-
-#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
- return v;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
- // 01
- // 0 +
- // --
- // 01
- SKC_RASTERIZE_FLOAT const w = mad(v.s10,(SKC_RASTERIZE_FLOAT)(0,1),v);
- return w;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
- // 0123
- // 012 +
- // ----
- // 0123
- // 01 +
- // ----
- // 0123
- //
- SKC_RASTERIZE_FLOAT const w = mad(v.s3012,(SKC_RASTERIZE_FLOAT)(0,1,1,1),v);
- SKC_RASTERIZE_FLOAT const x = mad(w.s2301,(SKC_RASTERIZE_FLOAT)(0,0,1,1),w);
- return x;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
- // 01234567
- // 0123456 +
- // --------
- // 01234567
- // 012345 +
- // --------
- // 01234567
- // 0123 +
- // --------
- // 01234567
- //
- SKC_RASTERIZE_FLOAT const w = mad(v.s70123456,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1),v);
- SKC_RASTERIZE_FLOAT const x = mad(w.s67012345,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1),w);
- SKC_RASTERIZE_FLOAT const y = mad(x.s45670123,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1),x);
- return y;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
- // 0123456789abcdef
- // 0123456789abcde +
- // ----------------
- // 0123456789abcdef
- // 0123456789abcd +
- // ----------------
- // 0123456789abcdef
- // 0123456789ab +
- // ----------------
- // 0123456789abcdef
- // 01234567 +
- // ----------------
- // 0123456789abcdef
- //
- SKC_RASTERIZE_FLOAT const w = mad(v.sf0123456789abcde,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);
- SKC_RASTERIZE_FLOAT const x = mad(w.sef0123456789abcd,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);
- SKC_RASTERIZE_FLOAT const y = mad(x.scdef0123456789ab,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);
- SKC_RASTERIZE_FLOAT const z = mad(y.s89abcdef01234567,(SKC_RASTERIZE_FLOAT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);
- return z;
-
-#endif
-
-#else
- //
- // SIMT
- //
-
- return sub_group_scan_inclusive_add(v);
-
-#endif
-}
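//
// The scalar fallback alluded to by the FIXME above -- a sketch (not
// from the original source) of an inclusive add scan over an
// n-element private array:
//
static void skc_scan_inclusive_add_sketch(float * const v, int const n)
{
  for (int ii=1; ii<n; ii++)
    v[ii] += v[ii-1]; // v[ii] becomes the sum of v[0..ii]
}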
-
-//
-//
-//
-
-static
-SKC_RASTERIZE_UINT
-skc_subgroup_scan_inclusive_add_uint(SKC_RASTERIZE_UINT const v)
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
- //
- // SIMD
- //
- // Note that there isn't a built-in horizontal scan for vectors so
- // we'll define some here for various widths.
- //
- // FIXME -- a scalar version might be faster so put in a
-  // compile-time switch to select between implementations
- //
-
-#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
- return v;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
- // 01
- // 0 +
- // --
- // 01
- SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s10,(SKC_RASTERIZE_UINT)(0,1),v);
- return w;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
- // 0123
- // 012 +
- // ----
- // 0123
- // 01 +
- // ----
- // 0123
- //
- SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s3012,(SKC_RASTERIZE_UINT)(0,1,1,1),v);
- SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s2301,(SKC_RASTERIZE_UINT)(0,0,1,1),w);
- return x;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
- // 01234567
- // 0123456 +
- // --------
- // 01234567
- // 012345 +
- // --------
- // 01234567
- // 0123 +
- // --------
- // 01234567
- //
- SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s70123456,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1),v);
- SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s67012345,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1),w);
- SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.s45670123,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1),x);
- return y;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
- // 0123456789abcdef
- // 0123456789abcde +
- // ----------------
- // 0123456789abcdef
- // 0123456789abcd +
- // ----------------
- // 0123456789abcdef
- // 0123456789ab +
- // ----------------
- // 0123456789abcdef
- // 01234567 +
- // ----------------
- // 0123456789abcdef
- //
- SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.sf0123456789abcde,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);
- SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.sef0123456789abcd,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);
- SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.scdef0123456789ab,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);
- SKC_RASTERIZE_UINT const z = SKC_MAD_UINT(y.s89abcdef01234567,(SKC_RASTERIZE_UINT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);
- return z;
-
-#endif
-
-#else
- //
- // SIMT
- //
-
- return sub_group_scan_inclusive_add(v);
-
-#endif
-}
-
-//
-//
-//
-
-static
-SKC_RASTERIZE_UINT
-skc_subgroup_scan_inclusive_max(SKC_RASTERIZE_UINT const v)
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
- //
- // SIMD
- //
- // Note that there isn't a built-in horizontal scan for vectors so
- // we'll define some here for various widths.
- //
- // FIXME -- a scalar version might be faster so put in a
-  // compile-time switch to select between implementations
- //
-
-#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
- return v;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
- // 01
- // 00 max
- // --
- // 01
- SKC_RASTERIZE_UINT const w = max(v.s00,v);
- return w;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
- // 0123
- // 0012 +
- // ----
- // 0123
- // 0101 +
- // ----
- // 0123
- //
- SKC_RASTERIZE_UINT const w = max(v.s0012,v);
- SKC_RASTERIZE_UINT const x = max(w.s0101,w);
- return x;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
- // 01234567
- // 00123456 +
- // --------
- // 01234567
- // 01012345 +
- // --------
- // 01234567
- // 01230123 +
- // --------
- // 01234567
- //
- SKC_RASTERIZE_UINT const w = max(v.s00123456,v);
- SKC_RASTERIZE_UINT const x = max(w.s01012345,w);
- SKC_RASTERIZE_UINT const y = max(x.s01230123,x);
- return y;
-
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
- // 0123456789abcdef
- // 00123456789abcde +
- // ----------------
- // 0123456789abcdef
- // 010123456789abcd +
- // ----------------
- // 0123456789abcdef
- // 01230123456789ab +
- // ----------------
- // 0123456789abcdef
- // 0123456701234567 +
- // ----------------
- // 0123456789abcdef
- //
- SKC_RASTERIZE_UINT const w = max(v.s00123456789abcde,v);
- SKC_RASTERIZE_UINT const x = max(w.s010123456789abcd,w);
- SKC_RASTERIZE_UINT const y = max(x.s01230123456789ab,x);
- SKC_RASTERIZE_UINT const z = max(y.s0123456701234567,y);
- return z;
-
-#endif
-
-#else
- //
- // SIMT
- //
-
- return sub_group_scan_inclusive_max(v);
-
-#endif
-}
-
-//
-//
-//
-
-static
-float
-skc_subgroup_last_float(SKC_RASTERIZE_FLOAT const v)
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
- //
- // SIMD
- //
-#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
- return v;
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
- return v.s1;
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
- return v.s3;
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
- return v.s7;
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
- return v.sf;
-#endif
-
-#else
- //
- // SIMT
- //
- return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);
-
-#endif
-}
-
-//
-//
-//
-
-static
-SKC_RASTERIZE_UINT
-skc_subgroup_last_uint(SKC_RASTERIZE_UINT const v)
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
- //
- // SIMD
- //
-#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
- return v;
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
- return v.s1;
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
- return v.s3;
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
- return v.s7;
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
- return v.sf;
-#endif
-
-#else
- //
- // SIMT
- //
- return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);
-
-#endif
-}
-
-//
-//
-//
-
-static
-float
-skc_subgroup_first(SKC_RASTERIZE_FLOAT const v)
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
- //
- // SIMD
- //
-#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
- return v;
-#else
- return v.s0;
-#endif
-
-#else
- //
- // SIMT
- //
- return sub_group_broadcast(v,0);
-
-#endif
-}
-
-//
-//
-//
-
-static
-SKC_RASTERIZE_FLOAT
-skc_subgroup_shuffle(SKC_RASTERIZE_FLOAT const v,
- SKC_RASTERIZE_UINT const i)
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
- //
- // SIMD
- //
-#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
- return v;
-#else
- return shuffle(v,i);
-#endif
-
-#else
- //
- // SIMT
- //
- return intel_sub_group_shuffle(v,i);
-
-#endif
-}
-
-//
-//
-//
-
-static
-SKC_RASTERIZE_FLOAT
-skc_subgroup_shuffle_up_1(SKC_RASTERIZE_FLOAT const p, // previous
- SKC_RASTERIZE_FLOAT const c) // current
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
- //
- // SIMD
- //
- // FIXME -- there are alternative formulations here:
- //
- // Option 1:
- //
- // select(c.rotate(+1),p.rotate(-1),(1,0,0,...))
- //
- // Option 2:
- //
- // p is a scalar
- // t = c.rotate(+1)
- // t.s0 = p;
- //
- // Option 3: ...
- //
-#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
- return p;
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
- return shuffle2(p,c,(uint2)(1,2));
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
- return shuffle2(p,c,(uint4)(3,4,5,6));
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
- return shuffle2(p,c,(uint8)(7,8,9,10,11,12,13,14));
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
- return shuffle2(p,c,(uint16)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30));
-#endif
-
-#else
- //
- // SIMT
- //
- return intel_sub_group_shuffle_up(p,c,1);
-
-#endif
-}
-
-//
-//
-//
-
-static
-bool
-skc_is_lane_first()
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1)
- //
- // SIMD
- //
- return true;
-#else
- //
- // SIMT
- //
- return get_sub_group_local_id() == 0;
-#endif
-}
-
-//
-//
-//
-
-static
-SKC_RASTERIZE_FLOAT
-skc_delta_offset()
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
- //
- // SIMD
- //
-#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
- return 1;
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
- return (SKC_RASTERIZE_FLOAT)( 1, 2 );
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
- return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4 );
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
- return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8 );
-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
- return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 );
-#endif
-
-#else
- //
- // SIMT
- //
- return 1.0f + get_sub_group_local_id();
-
-#endif
-
-}
-
-//
-//
-//
-
-static
-int
-skc_subgroup_any(SKC_RASTERIZE_PREDICATE const p)
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
- //
- // SIMD
- //
- return any(p);
-#else
- //
- // SIMT
- //
- return sub_group_any(p);
-#endif
-}
-
-//
-//
-//
-
-#define SKC_PATH_NODEWORD_IS_LAST(n) (((n) & SKC_DEVICE_BLOCK_WORDS_MASK) == SKC_DEVICE_BLOCK_WORDS_MASK)
-
-void
-skc_segment_next(__global union skc_bp_elem * const bp_elems,
- skc_uint * const nodeword,
- skc_block_id_t * const id)
-{
- if ((++*id & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
- {
- if (SKC_PATH_NODEWORD_IS_LAST(++*nodeword))
- {
- *nodeword = SKC_TAGGED_BLOCK_ID_GET_ID(bp_elems[*nodeword].tag_id) * SKC_DEVICE_SUBBLOCK_WORDS;
- }
-
- skc_tagged_block_id_t const tag_id = bp_elems[*nodeword].tag_id;
-
- *id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
- }
-}
-
-//
-//
-//
-
-static
-SKC_RASTERIZE_FLOAT
-skc_native_length(SKC_RASTERIZE_FLOAT const x, SKC_RASTERIZE_FLOAT const y)
-{
- return native_sqrt(x * x + y * y);
-}
-
-//
-// Wang's Formula (1985)
-//
-
-#define SKC_WANG_PIXEL_RESL 0.25f // <-- this can be tuned
-
-#define SKC_WANG_EPSILON (SKC_WANG_PIXEL_RESL * SKC_SUBPIXEL_RESL_X_F32)
-
-#define SKC_WANG_CUBIC ((3.0f * 2.0f) / (8.0f * SKC_WANG_EPSILON))
-#define SKC_WANG_QUADRATIC ((2.0f ) / (8.0f * SKC_WANG_EPSILON))
-
-#define SKC_WANG_LENGTH(x,y) skc_native_length(x,y)
-#define SKC_WANG_SQRT(x) native_sqrt(x)
-
-//
-//
-//
-
-static
-SKC_RASTERIZE_FLOAT
-skc_wangs_formula_cubic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,
- SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,
- SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y,
- SKC_RASTERIZE_FLOAT const t3x, SKC_RASTERIZE_FLOAT const t3y)
-{
- //
- // Return the number of evenly spaced (in the parametric sense) line
- // segments that are guaranteed to be within "epsilon" error of the
- // curve.
- //
- // We're then going to take multiples of the reciprocal of this
- // number so that the segmentation can be distributed across the
- // subgroup.
- //
- // Note, this can probably be slightly optimized per architecture
- // but it's probably far from being a hotspot since it's all
- // straight-line unpredicated code.
- //
-  // The result is an integer in the range [1.0,#segments]
- //
- // Note that even if all of the control points are coincident, the
- // max(1.0f) will categorize this as a line of 1 segment.
- //
- // This is what we want! We want to convert cubics to lines as
- // easily as possible and *then* cull lines that are either
- // horizontal or zero length.
- //
- return max(1.0f,
- ceil(SKC_WANG_SQRT(SKC_WANG_CUBIC *
- SKC_WANG_LENGTH(max(fabs(t2x - 2.0f * t1x + t0x),
- fabs(t3x - 2.0f * t2x + t1x)),
- max(fabs(t2y - 2.0f * t1y + t0y),
- fabs(t3y - 2.0f * t2y + t1y))))));
-}
-
-static
-SKC_RASTERIZE_FLOAT
-skc_wangs_formula_quadratic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,
- SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,
- SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y)
-{
- return max(1.0f,
- ceil(SKC_WANG_SQRT(SKC_WANG_QUADRATIC *
- SKC_WANG_LENGTH(fabs(t2x - 2.0f * t1x + t0x),
- fabs(t2y - 2.0f * t1y + t0y)))));
-}
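//
// A sketch (not from the original source) of how a Wang's Formula
// segment count is consumed below: evenly spaced parametric t values
// with the final one clamped to exactly 1.0f, matching the kernel's
// "is last" override.  'ts' is a hypothetical output array with
// (int)segs entries.
//
static void skc_flatten_ts_sketch(float const segs, float * const ts)
{
  for (float delta = 1.0f; delta <= segs; delta += 1.0f)
    ts[(int)delta - 1] = (delta >= segs) ? 1.0f : delta / segs;
}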
-
-//
-// rational curves
-//
-
-static
-SKC_RASTERIZE_FLOAT
-skc_wangs_formula_cubic_rat()
-{
- return 0.0f;
-}
-
-static
-SKC_RASTERIZE_FLOAT
-skc_wangs_formula_quad_rat()
-{
- return 0.0f;
-}
-
-//
-// flush any work-in-progress blocks and return unused block ids
-//
-
-static
-void
-skc_finalize(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
- __global union skc_bp_elem * const bp_elems,
- __global uint * const bp_ids,
- skc_uint const bp_mask,
- __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
- skc_block_id_v_t * const blocks,
- skc_uint const blocks_next,
- skc_ttsk_v_t * const sk_v,
- skc_uint const sk_v_next,
- __global skc_ttsk_s_t * const sk_extent,
- __local struct skc_subgroup_smem volatile * const smem)
-{
- //
- // flush non-empty bins
- //
- // FIXME -- accelerate this iteration/search with a subgroup operation
- //
- for (skc_uint ii=0; ii<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; ii++)
- {
- if (smem->bin.aN.count[ii] > 0)
- {
- skc_block_id_v_t const id = smem->bin.aN.id[ii];
- skc_uint const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
- skc_uint const tts = smem->bin.aN.ttsb[ii][skc_subgroup_lane()];
-#if 0
- printf("???????? : [ %10u = %10u : %08X ]\n",id,idx,tts);
-#endif
- bp_elems[idx].u32 = tts;
- }
-
- //
- // FIXME -- vectorize with vstoreN()
- //
- }
-
- //
- // return remaining block ids back to the pool
- //
- skc_uint const blocks_rem = SKC_RASTERIZE_BLOCK_ID_V_SIZE - blocks_next;
-
- if (blocks_rem > 0)
- {
- skc_uint bp_idx = 0;
-
- if (skc_subgroup_lane() == 0)
- {
- bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,blocks_rem);
-
-#if 0
- printf("r-: %8u + %u\n",bp_idx,blocks_rem);
-#endif
- }
-
- bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane() - blocks_next) & bp_mask;
-
- if (skc_subgroup_lane() >= blocks_next)
- {
- bp_ids[bp_idx] = *blocks;
- }
- }
-
- //
- // flush work-in-progress ryx keys
- //
- if (sk_v_next > 0)
- {
- skc_uint sk_idx = 0;
-
- if (skc_subgroup_lane() == 0)
- {
- sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
- (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,sk_v_next);
-#if 0
- printf("* %u\n",sk_idx);
-#endif
- }
-
- sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();
-
- if (skc_subgroup_lane() < sk_v_next)
- {
- sk_extent[sk_idx] = *sk_v;
- }
- }
-}
-
-//
-// If there are lanes that were unable to append to a bin because
-// their hashes collided with a bin's current ryx key, then those bins
-// must be ejected.
-//
-// Note that we do not eject "full" bins because lazily waiting for a
-// collision results in simpler code.
-//
-
-static
-void
-skc_flush(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
- __global union skc_bp_elem * const bp_elems,
- __global uint * const bp_ids,
- skc_uint const bp_mask,
- __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
- skc_block_id_t * const subblocks,
- skc_block_id_v_t * const blocks,
- skc_uint * const blocks_next,
- skc_ttsk_v_t * const sk_v,
- skc_uint * const sk_v_next,
- __global skc_ttsk_s_t * const sk_extent,
- __local struct skc_subgroup_smem volatile * const smem,
- SKC_RASTERIZE_UINT const hash,
- SKC_RASTERIZE_UINT const yx,
- SKC_RASTERIZE_PREDICATE is_collision) // pass by value
-{
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
- //
- // SIMD
- //
-
- //
- // FIXME -- this code is now stale with the changes to the
- // subblock/block allocation strategy
- //
-
- //
- // get local TTSB ID queue count
- //
- skc_uint ttsb_id_count = smem->pool.count; // scalar
-
- // init hash bit mask
- skc_uint component_mask = 0;
-
- for (int cc=0; cc<SKC_RASTERIZE_VECTOR_SIZE; cc++)
- {
- // if no collision continue
- if (((int*)&is_collision)[cc] == 0)
- continue;
-
- uint const winner = ((uint*)&hash)[cc];
- uint const component_bit = 1u << winner;
-
- // if already processed this hash then continue
- if (component_mask & component_bit)
- continue;
-
- // update component mask
- component_mask |= component_bit;
-
- //
- // new winner requires ejecting the old TTSB
- //
- if (smem->bin.aN.count[winner] > 0)
- {
- skc_uint const elem_idx = smem->bin.aN.id[winner] * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
-
- bp_elems[elem_idx].u32 = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];
- }
-
- //
- // ensure there is at least one TTSK and TTSB ID
- //
- if (ttsb_id_count == SKC_RASTERIZE_POOL_SIZE)
- {
- //
- // update remaining count
- //
- ttsb_id_count = 0;
-
- //
- // flush accumulated ttsk_ryx keys
- //
- uint const idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
- (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_POOL_SIZE); // ttsk_ryx_count
-
-#if 0
- printf("# %u\n",idx);
-#endif
-
- for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE)
- {
- ttsk_ryx[idx + ii] = skc_make_ttsk_ryx(smem,SKC_CMD_RASTERIZE_GET_COHORT(cmd),ii);
- }
-
- //
- // allocate more ttsb ids from pool
- //
- uint const id = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+0,SKC_RASTERIZE_POOL_SIZE); // ring_reads
-
- for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE)
- smem->pool.aN.id[ii] = bp_ids[id + ii];
- }
-
- //
- // invalidate the winning block
- //
-
- //
- // update bin with winning yx, new ttsb id and zero count
- //
- // all lanes are loading/storing from/to the same index
- //
- smem->bin.vN.ttsb [winner] = ( SKC_TTS_INVALID );
- smem->bin.aN.id [winner] = smem->pool.aN.id[ttsb_id_count];
- smem->bin.aN.yx [winner] = smem->pool.aN.yx[ttsb_id_count] = ((uint*)&yx)[cc];
- smem->bin.aN.count[winner] = 0;
-
- //
- // update count
- //
- ttsb_id_count += 1;
- }
-
- //
- // save count
- //
- smem->pool.count = ttsb_id_count;
-
-#else
- //
- // SIMT
- //
-
- do {
- //
- // only one lane will win!
- //
- if (is_collision)
- smem->subgroup.winner = hash;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- //
- // which bin is being ejected?
- //
- skc_uint const winner = smem->subgroup.winner;
-
- //
- // which colliding hash is taking over the bin?
- //
- SKC_RASTERIZE_PREDICATE const is_winner = is_collision && (hash == winner);
-
- //
- // all lanes with the same hash will try to store but only one
- // lane will win
- //
- if (is_winner)
- smem->subgroup.winner = yx;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- //
- // flush this block to the pool
- //
- if (smem->bin.aN.count[winner] > 0)
- {
- skc_block_id_v_t const id = smem->bin.aN.id[winner];
- skc_uint const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
- skc_uint const tts = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];
-#if 0
- printf("%08X : [ %10u = %10u : %08X ]\n",yx,id,idx,tts);
-#endif
- bp_elems[idx].u32 = tts;
- }
-
- //
- // append new ttsk
- //
- skc_uint const new_yx = smem->subgroup.winner;
- skc_block_id_t const new_id = skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_ARGS(),
- blocks_next,
- bp_atomics,
- bp_mask, // pow2 modulo mask for block pool ring
- bp_ids,
- cohort_atomics,
- sk_v,
- sk_v_next,
- sk_extent,
- new_yx);
-
-#if 0
- if (get_sub_group_local_id() == 0) {
- printf(">>> %9u\n",new_id);
- }
-#endif
-
- //
- // update bin with winning yx, new ttsb id and zero count
- //
- smem->bin.aN.ttsb [winner][skc_subgroup_lane()] = SKC_TTS_INVALID;
- smem->bin.aN.yx [winner] = new_yx;
- smem->bin.aN.id [winner] = new_id;
- smem->bin.aN.count[winner] = 0;
-
- //
- // remove all lanes matching this hash
- //
- is_collision = is_collision && !is_winner;
-
- //
- // exit if nothing left to do
- //
- } while (sub_group_any(is_collision));
-
-#endif
-}
-
-//
-// scatter scan max
-//
-static
-SKC_RASTERIZE_UINT
-skc_scatter_scan_max(__local struct skc_subgroup_smem volatile * const smem,
- SKC_RASTERIZE_FLOAT const iss,
- SKC_RASTERIZE_FLOAT const ess)
-{
- //
- // prefix sums determine which lanes we're going to work on next
- //
- SKC_RASTERIZE_PREDICATE const is_scratch_store = (iss > 0.0f) && (ess < (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP);
- SKC_RASTERIZE_UINT const scratch_idx = SKC_CONVERT(SKC_RASTERIZE_UINT)(max(ess,0.0f));
-
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
- //
- // SIMD
- //
-#ifdef SKC_RASTERIZE_SIMD_USES_SMEM
- //
- // SIMD APPROACH 1: SIMT'ISH
- //
-
- // zero the volatile smem scratchpad using vector syntax
- smem->subgroup.vN.scratch[0] = ( 0 );
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) \
- if (is_scratch_store C) \
- smem->subgroup.aN.scratch[scratch_idx C] = I;
-
- SKC_RASTERIZE_VECTOR_EXPAND();
-
- // propagate lanes to right using max scan
- SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[0];
- SKC_RASTERIZE_UINT const source = skc_subgroup_scan_inclusive_max(scratch);
-
-#else
- //
- // SIMD APPROACH 2: SCALAR'ISH
- //
-
- SKC_RASTERIZE_UINT source = ( 0 );
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) \
- if (is_scratch_store C) \
- ((uint *)&source)[scratch_idx C] = I;
-
- SKC_RASTERIZE_VECTOR_EXPAND();
-
- for (uint ii=1; ii<SKC_RASTERIZE_ELEMS_PER_SUBGROUP; ii++)
- ((uint *)&source)[ii] = max(((uint *)&source)[ii-1],((uint *)&source)[ii]);
-#endif
-
-#else
- //
- // SIMT
- //
-
- //
- // zero the volatile smem scratchpad using vector syntax
- //
- smem->subgroup.vN.scratch[skc_subgroup_lane()] = ( 0 );
-
- //
- // store source lane at starting lane
- //
- if (is_scratch_store)
- smem->subgroup.aN.scratch[scratch_idx] = skc_subgroup_lane();
-
- //
- // propagate lanes to right using max scan
- //
- SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[skc_subgroup_lane()];
- SKC_RASTERIZE_UINT const source = skc_subgroup_scan_inclusive_max(scratch);
-#endif
-
- return source;
-}
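//
// A worked example (not from the original source) of the scatter +
// max-scan trick above.  Suppose three source lanes report segment
// counts { 3, 1, 2 }, so the exclusive scan is { 0, 3, 4 }.  Each
// source lane scatters its own index at its exclusive offset:
//
//   scratch = [ 0, 0, 0, 1, 2, 0, 0, 0 ]
//
// and the inclusive max scan propagates indices to the right:
//
//   source  = [ 0, 0, 0, 1, 2, 2, 2, 2 ]
//
// so each lane now knows which source lane's work it should expand.
//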
-
-//
-// sliver lines into subpixels
-//
-
-static
-void
-skc_sliver(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
- __global union skc_bp_elem * const bp_elems,
- __global uint * const bp_ids,
- skc_uint const bp_mask,
- __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
- skc_block_id_t * const subblocks,
- skc_block_id_v_t * const blocks,
- skc_uint * const blocks_next,
- skc_ttsk_v_t * const sk_v,
- skc_uint * const sk_v_next,
- __global skc_ttsk_s_t * const sk_extent,
- __local struct skc_subgroup_smem volatile * const smem,
- SKC_RASTERIZE_FLOAT const l0x,
- SKC_RASTERIZE_FLOAT const l0y,
- SKC_RASTERIZE_FLOAT const l1x,
- SKC_RASTERIZE_FLOAT const l1y)
-{
- //
- // Y-SLIVERING
- // -----------
- //
-  // immediately sliver all multi-pixel lines into 1-pixel high
- // lines
- //
- // note this implicitly squelches horizontal lines
- //
- // there is another test for horizontal lines after x-slivering
- // is complete
- //
-
- //
- // will we need to flip the sign of y_delta ?
- //
- SKC_RASTERIZE_PREDICATE const y_lt = (l0y <= l1y);
- SKC_RASTERIZE_UINT const dy_xor = y_lt ? 0 : 0x80000000;
-
- //
- // save 1/dy
- //
- SKC_RASTERIZE_FLOAT const y_denom = native_recip(l1y - l0y);
-
- //
- // how many non-horizontal subpixel y-axis slivers are there?
- //
- SKC_RASTERIZE_FLOAT const y_min = floor(fmin(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
- SKC_RASTERIZE_FLOAT const y_max = ceil (fmax(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
- SKC_RASTERIZE_FLOAT const y_base = y_lt ? y_min : y_max;
- SKC_RASTERIZE_FLOAT y_segs = y_max - y_min;
-
- //
- // inclusive subgroup scan of y_segs
- //
- SKC_RASTERIZE_FLOAT y_iss = skc_subgroup_scan_inclusive_add_float(y_segs);
- SKC_RASTERIZE_FLOAT y_ess = y_iss - y_segs;
- float y_rem = skc_subgroup_last_float(y_iss);
-
- //
- // if this is a horizontal line then tweak y_iss so "is_scratch_store" always fails
- //
- if (y_segs == 0.0f)
- y_iss = 0.0f;
-
-#if 0
- printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } (* %5.0f / %5.0f / %5.0f / %5.0f *) }, \n",a0x,a0y,a1x,a1y,y_segs,y_iss,y_ess,y_rem);
-#endif
-
- //
- // these values don't matter on first iteration
- //
- SKC_RASTERIZE_FLOAT n1x_prev = 0;
- SKC_RASTERIZE_FLOAT n1y_prev = 0;
-
- //
- // loop until done
- //
- while (y_rem > 0.0f)
- {
- //
- // distribute work across lanes
- //
- SKC_RASTERIZE_UINT const y_source = skc_scatter_scan_max(smem,y_iss,y_ess);
-
- //
- // get line at y_source line
- //
- SKC_RASTERIZE_FLOAT const m0x = skc_subgroup_shuffle(l0x,y_source);
- SKC_RASTERIZE_FLOAT const m0y = skc_subgroup_shuffle(l0y,y_source);
- SKC_RASTERIZE_FLOAT const m1x = skc_subgroup_shuffle(l1x,y_source);
- SKC_RASTERIZE_FLOAT const m1y = skc_subgroup_shuffle(l1y,y_source);
-
- //
- // every lane will create a 1 pixel tall line "sliver"
- //
- // FIXME -- this gets expanded on SIMD
- //
- // if numerator == 1 then this is the first lane
- // if numerator == s then this is the last lane
- //
- SKC_RASTERIZE_FLOAT const y_delta = skc_delta_offset() - skc_subgroup_shuffle(y_ess,y_source);
- SKC_RASTERIZE_FLOAT const y_count = skc_subgroup_shuffle(y_segs,y_source);
-
- SKC_RASTERIZE_PREDICATE const is_y_first = (y_delta == 1.0f);
- SKC_RASTERIZE_PREDICATE const is_y_last = (y_delta >= y_count);
-
- // toggle y_delta sign
- SKC_RASTERIZE_FLOAT const y_offset = as_float((as_uint(y_delta) ^ intel_sub_group_shuffle(dy_xor,y_source)));
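      //
      // Note (not from the original source): dy_xor is either 0 or
      // 0x80000000, so the XOR above either leaves y_delta untouched
      // or flips its sign bit -- a branchless way to step the slivers
      // downward when l0y > l1y.
      //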
-
- //
- // calculate "right" line segment endpoint
- //
- SKC_RASTERIZE_FLOAT n1y = (y_offset + skc_subgroup_shuffle(y_base,y_source)) * SKC_SUBPIXEL_Y_SCALE_UP;
- SKC_RASTERIZE_FLOAT const n_t = (n1y - m0y) * skc_subgroup_shuffle(y_denom,y_source);
- SKC_RASTERIZE_FLOAT n1x = round(SKC_LERP(m0x,m1x,n_t));
-
- //
- // override c1 if this is last point
- //
- n1y = select(n1y,m1y,is_y_last);
- n1x = select(n1x,m1x,is_y_last);
-
- //
- // shuffle up "left" line segment endpoint
- //
- // NOTE: Intel's shuffle_up is unique with its elegant
- // "previous" argument so don't get used to it
- //
- SKC_RASTERIZE_FLOAT n0y = skc_subgroup_shuffle_up_1(n1y_prev,n1y);
- SKC_RASTERIZE_FLOAT n0x = skc_subgroup_shuffle_up_1(n1x_prev,n1x);
-
- //
- // override shuffle up if this is the first line segment
- //
- n0y = select(n0y,m0y,is_y_first);
- n0x = select(n0x,m0x,is_y_first);
-
- //
- // save previous right endpoint
- //
- n1x_prev = n1x;
- n1y_prev = n1y;
-
- //
- // decrement by subgroup size
- //
- y_iss -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
- y_ess -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
- y_rem -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
-
-#if 0
- //
- // debug
- //
- if (n0y != n1y) {
- printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",n0x,n0y,n1x,n1y);
- }
-#endif
-
- //
- // X-SLIVERING
- // -----------
- //
-      // now sliver 1-pixel high lines into either vertical or
- // 1-pixel wide lines
- //
- // save original direction and work with increasing x
- //
- SKC_RASTERIZE_PREDICATE const x_lt = (n0x <= n1x);
- SKC_RASTERIZE_UINT const dx_xor = x_lt ? 0 : 0x80000000;
-
- //
-      // save 1/dx
- //
- SKC_RASTERIZE_FLOAT const x_denom = native_recip(n1x - n0x);
-
- //
-      // how many subpixel x-axis slivers are there?
- //
- SKC_RASTERIZE_FLOAT const x_min = floor(fmin(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
- SKC_RASTERIZE_FLOAT const x_max = ceil (fmax(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
- SKC_RASTERIZE_FLOAT const x_base = x_lt ? x_min : x_max;
- SKC_RASTERIZE_FLOAT const x_segs = fmax(x_max - x_min,1.0f);
-
- //
-      // inclusive subgroup scan of x_segs
- //
- SKC_RASTERIZE_FLOAT x_iss = skc_subgroup_scan_inclusive_add_float(x_segs);
- SKC_RASTERIZE_FLOAT x_ess = x_iss - x_segs;
- float x_rem = skc_subgroup_last_float(x_iss);
-
- //
- // if this is a horizontal line then tweak x_iss so "is_scratch_store" always fails
- //
- //if (x_segs == 0.0f)
- // x_iss = 0.0f;
-
- //
- // these values don't matter on first iteration
- //
- SKC_RASTERIZE_FLOAT p1x_prev = 0;
- SKC_RASTERIZE_FLOAT p1y_prev = 0;
-
- //
- // loop until done
- //
- while (x_rem > 0)
- {
- //
- // distribute work across lanes
- //
- SKC_RASTERIZE_UINT const x_source = skc_scatter_scan_max(smem,x_iss,x_ess);
-
- //
-	  // get the line at lane x_source
- //
- SKC_RASTERIZE_FLOAT const o0x = skc_subgroup_shuffle(n0x,x_source);
- SKC_RASTERIZE_FLOAT const o0y = skc_subgroup_shuffle(n0y,x_source);
- SKC_RASTERIZE_FLOAT const o1x = skc_subgroup_shuffle(n1x,x_source);
- SKC_RASTERIZE_FLOAT const o1y = skc_subgroup_shuffle(n1y,x_source);
-
- //
-	  // every lane will create a 1 pixel wide line "sliver"
- //
- // FIXME -- this gets expanded on SIMD
- //
- // if numerator == 1 then this is the first lane
- // if numerator == s then this is the last lane
- //
- SKC_RASTERIZE_FLOAT const x_delta = skc_delta_offset() - skc_subgroup_shuffle(x_ess,x_source);
- SKC_RASTERIZE_FLOAT const x_count = skc_subgroup_shuffle(x_segs,x_source);
-
- SKC_RASTERIZE_PREDICATE const is_x_first = (x_delta == 1.0f);
- SKC_RASTERIZE_PREDICATE const is_x_last = (x_delta >= x_count);
-
- // toggle x_delta sign
- SKC_RASTERIZE_FLOAT const x_offset = as_float((as_uint(x_delta) ^ intel_sub_group_shuffle(dx_xor,x_source)));
-
- //
- // calculate "right" line segment endpoint
- //
- SKC_RASTERIZE_FLOAT p1x = (x_offset + skc_subgroup_shuffle(x_base,x_source)) * SKC_SUBPIXEL_X_SCALE_UP;
- SKC_RASTERIZE_FLOAT const p_t = (p1x - o0x) * skc_subgroup_shuffle(x_denom,x_source);
- SKC_RASTERIZE_FLOAT p1y = round(SKC_LERP(o0y,o1y,p_t));
-
- //
- // override c1 if this is last point
- //
- p1x = select(p1x,o1x,is_x_last);
- p1y = select(p1y,o1y,is_x_last);
-
- //
- // shuffle up "left" line segment endpoint
- //
- // NOTE: Intel's shuffle_up is unique with its elegant
- // "previous" argument so don't get used to it
- //
- SKC_RASTERIZE_FLOAT p0x = skc_subgroup_shuffle_up_1(p1x_prev,p1x);
- SKC_RASTERIZE_FLOAT p0y = skc_subgroup_shuffle_up_1(p1y_prev,p1y);
-
- //
- // override shuffle up if this is the first line segment
- //
- p0x = select(p0x,o0x,is_x_first);
- p0y = select(p0y,o0y,is_x_first);
-
- //
- // save previous right endpoint
- //
- p1x_prev = p1x;
- p1y_prev = p1y;
-
- //
- // decrement by subgroup size
- //
- x_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
- x_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
- x_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
-
- //
- // only non-horizontal subpixel lines are valid
- //
- SKC_RASTERIZE_PREDICATE is_active = (p0y != p1y);
-
- //
- // if no lanes are active then continue
- //
- // FIXME -- THIS SIMPLE SUB_GROUP_ANY TEST SIGNIFICANTLY
- // IMPACTS PERFORMANCE (+12% ?)
- //
- // IT SHOULDN'T !!!
- //
-#if 0
- if (!skc_subgroup_any(is_active))
- continue;
-#endif
-
- //
- // Option 1: use SLM for explicitly managed coalesced stores
- //
- // 1. which tile does this line belong?
- // 2. hash tile coordinates
- // 3. lookup hash
- // 4. if tile matches then SLM append keys
- // 5. if tile doesn't match
- // a. flush
- // b. create new TTSK_RYX
- // c. obtain TTSB block from pool
- // d. goto 3.
- //
-
- //
- // Option 2: rely on L1/L2/L3 to mitigate non-coalesced stores
- //
- // 1. which tile does this line belong?
- // 2. hash tile coordinates
- // 3. lookup hash
- // 4. if tile matches then GMEM append keys
- // 5. if tile doesn't match
- // a. flush (and invalidate empty elems)
- // b. create new TTSK_RYX
- // c. obtain TTSB block from pool
- // d. goto 3.
- //
-
- //
- // The virtual rasterization surface is very large and
- // signed: +/- ~64K-256K, depending on the architecture.
- //
- // Rasters must be clipped to the virtual surface and,
- // optionally, clipped even further on a per raster
- // basis.
- //
-
- //
- // Clip to the per-raster clip
- //
-
- /*
-
- CLIP HERE
-
- */
-
- //
- // Hash the tile coordinates
- //
- // This table lists nominal values for each architecture.
-	  // We want to choose values that naturally fit the
- // "width" of the architecture.
- //
- // SIMD RANGE BITS MAX RANGE MAX BINS HASH BITS
- // ---- ------- ---- --------- -------- ---------
- // 4 [0, 4] 3 [0, 7] 10 mod(10) <-- SSE42, ?
- // 8 [0, 8] 4 [0, 15] 8 3 <-- GEN*,AVX*
- // 16 [0, 16] 5 [0, 31] 6 mod(6) <-- GEN*,?
- // 32 [0, 32] 6 [0, 63] 5 mod(5) <-- CUDA,PowerVR,Adreno,GEN*
- // 64 [0, 64] 7 [0,127] 4 2 <-- AMD Radeon
- //
- // NOTE: When possible, bias the hash toward using more y
- // bits because of:
- //
- // 1. the 90 degree counter-clockwise rotation that we put
- // in place to offset the render-time clockwise
- // rotation
- //
- // 2. the likely presence of left-to-right or
- // right-to-left glyphs.
- //
- // For power-of-two bins, the hash is easy.
- //
- // For non-power-of-two, we may want to either implement a
- // fast mod (compiler should do this for us... hahahaha) or
- // drop down to the next power-of-two.
- //
-
- //
- // FIXME -- this snarl is not good -- can probably reduce
- // some of the sign casting but some is there to vectorize a
- // scalar
- //
- SKC_RASTERIZE_INT const z0y = SKC_CONVERT(SKC_RASTERIZE_INT)(p0y);
- SKC_RASTERIZE_INT const z1y = SKC_CONVERT(SKC_RASTERIZE_INT)(p1y);
-
- SKC_RASTERIZE_INT const z0x = SKC_CONVERT(SKC_RASTERIZE_INT)(p0x);
- SKC_RASTERIZE_INT const z1x = SKC_CONVERT(SKC_RASTERIZE_INT)(p1x);
-
- SKC_RASTERIZE_INT const min_y = min(z0y,z1y);
- SKC_RASTERIZE_INT const max_y = max(z0y,z1y);
-
- SKC_RASTERIZE_INT const tile_y = min_y >> SKC_SUBTILE_RESL_Y_LOG2;
-
- SKC_RASTERIZE_UINT const ty = SKC_AS(SKC_RASTERIZE_UINT)(min_y) & SKC_SUBTILE_MASK_Y;
- SKC_RASTERIZE_INT dy = SKC_AS(SKC_RASTERIZE_INT)(z1y - z0y);
-
- //
- // map [+1,+32] to [ 0,+31]
- // map [-1,-32] to [-1,-32]
- //
- SKC_RASTERIZE_INT dys = (dy + (~dy >> 31)) << 26;
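	  //
	  // Note (not from the original source): (~dy >> 31) is -1 for
	  // positive dy and 0 for negative dy, so the sum above biases
	  // only the positive range: +1 -> 0, +32 -> 31, -1 -> -1,
	  // -32 -> -32.  The << 26 then places the 6-bit result in the
	  // top bits of the tts word assembled below.
	  //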
-
- SKC_RASTERIZE_INT const min_x = min(z0x,z1x);
- SKC_RASTERIZE_INT const max_x = max(z0x,z1x);
- SKC_RASTERIZE_INT const tile_x = min_x >> SKC_SUBTILE_RESL_X_LOG2;
-
- SKC_RASTERIZE_UINT const tx = SKC_AS(SKC_RASTERIZE_UINT)(min_x) & SKC_SUBTILE_MASK_X;
- SKC_RASTERIZE_UINT const sx = SKC_AS(SKC_RASTERIZE_UINT)(max_x - min_x);
-
- SKC_RASTERIZE_UINT const tts = dys | (ty << 16) | (sx << 10) | tx;
-
- SKC_RASTERIZE_UINT const hash = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & SKC_RASTERIZE_TILE_HASH_Y_MASK) << SKC_RASTERIZE_TILE_HASH_X_BITS) |
- (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & SKC_RASTERIZE_TILE_HASH_X_MASK));
-
- SKC_RASTERIZE_UINT const yx = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & 0xFFF) << 12) | (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & 0xFFF));
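	  //
	  // Example (not from the original source): assuming the 8-wide
	  // GEN row of the table above -- SKC_RASTERIZE_TILE_HASH_X_BITS
	  // of 1 and SKC_RASTERIZE_TILE_HASH_Y_BITS of 2 -- the hash
	  // reduces to:
	  //
	  //   hash = ((tile_y & 3) << 1) | (tile_x & 1)  // 8 bins, 3 bits
	  //
	  // so horizontally adjacent tiles alternate bins and vertically
	  // adjacent tiles cycle through four bins.
	  //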
-
-#if 0
- printf("(%3u, %3u)\n",tile_y,tile_x);
-#endif
-
-#if 0
- if (is_active)
- printf("( %3u, %3u ) : [ %3u, %3u, %3d, %3d, %3u ]\n",tile_y,tile_x,ty,tx,dy,((int)dys)>>26,sx);
-#endif
-
- //
- // debug
- //
-#if 0 // PRINTF_ENABLE
-
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) \
- if (is_active C) \
- printf("{ { %5d, %5d }, { %5d, %5d } (* %2u *) },\n",z0x C,z0y C,z1x C,z1y C,hash C);
-
- SKC_RASTERIZE_VECTOR_EXPAND();
-#else
- if (is_active)
- printf("{ { %5d, %5d }, { %5d, %5d } } (* %2u *),\n",z0x,z0y,z1x,z1y,hash);
-#endif
-
-#endif
- //
- // flush all active lanes
- //
- while (true)
- {
- //
- // either gather load or vector load+shuffle the yx keys
- //
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
- SKC_RASTERIZE_BIN const yx_bin = smem->bin.vN.yx;
- SKC_RASTERIZE_UINT const yx_cur = shuffle(yx_bin,hash);
-#else
- SKC_RASTERIZE_UINT const yx_cur = smem->bin.aN.yx[hash];
-#endif
-
- //
- // does yx for lane match yx for hash?
- //
- SKC_RASTERIZE_UINT const active_yx = is_active ? yx : SKC_RASTERIZE_YX_INVALID;
- SKC_RASTERIZE_PREDICATE const is_match = (yx_cur == active_yx);
-
- //
- // OpenCL spec: "When casting a bool to a vector integer
- // data type, the vector components will be set to -1
- // (i.e. all bits set) if the vector bool value is true
- // and 0 otherwise.
- //
-#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
- SKC_RASTERIZE_UINT const h_match = (SKC_RASTERIZE_UINT)is_match;
-#else
- SKC_RASTERIZE_UINT const h_match = abs(is_match); // {-1,0} -> {+1,0}
-#endif
- //
- // how many new elements for each matching hash bin?
- //
- SKC_RASTERIZE_UINT const h_shl = hash * SKC_RASTERIZE_TILE_HASH_BIN_BITS;
- SKC_RASTERIZE_UINT const h = h_match << h_shl;
-
- //
- // prefix sum all of the bins in parallel
- //
- SKC_RASTERIZE_UINT const h_iss = skc_subgroup_scan_inclusive_add_uint(h);
- SKC_RASTERIZE_UINT const h_total = skc_subgroup_last_uint(h_iss);
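	      //
	      // Note (not from the original source): each hash bin owns a
	      // SKC_RASTERIZE_TILE_HASH_BIN_BITS wide field inside one
	      // uint, so a single subgroup add scan counts every bin at
	      // once.  For example, if lanes 0..3 hash to bins { 2, 0, 2, 2 }
	      // and all match, the bin 2 field of the scan reads { 1, 1, 2, 3 }
	      // across the lanes and the bin 0 field reads { 0, 1, 1, 1 };
	      // the shift and mask below hand each lane its own bin's
	      // running total.
	      //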
-
- //
- // current bin counts
- //
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
- SKC_RASTERIZE_BIN const count_bin = smem->bin.vN.count;
- SKC_RASTERIZE_UINT const count_cur = shuffle(count_bin,hash);
-#else
- SKC_RASTERIZE_UINT const count_cur = smem->bin.aN.count[hash];
-#endif
-
- //
- // calculate where each cache-hit and in-bounds tts should be stored
- //
- SKC_RASTERIZE_UINT const ttsb_index = (h_iss >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur - 1;
- SKC_RASTERIZE_UINT const count_new = (h_total >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur;
-
- //
- // which lanes can append to a matching bin?
- //
- SKC_RASTERIZE_PREDICATE const is_append = is_match && (ttsb_index < SKC_DEVICE_SUBBLOCK_WORDS);
-
- //
- // scatter append tts elements to bin blocks
- //
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1)
- //
- // SIMD
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) \
- if (is_append C) \
- { \
- smem->bin.aN.ttsb [hash C][ttsb_index C] = tts C; \
- smem->bin.aN.count[hash C] = count_new C; \
- }
-
- SKC_RASTERIZE_VECTOR_EXPAND();
-#else
- //
- // SIMT
- //
- if (is_append)
- {
- smem->bin.aN.ttsb [hash][ttsb_index] = tts;
- smem->bin.aN.count[hash] = count_new; // it's ok if this is > SKC_DEVICE_SUBBLOCK_WORDS
- }
-#endif
- //
- // try to keep predicate updates SIMD-friendly and
- // outside of predicated code paths -- this is not
- // always how we would normally do things on SIMT but
- // either approach is acceptable
- //
-
- //
- // mask off lanes/components that successfully appended
- //
- is_active = is_active && !is_append;
-
- //
- // are there any active lanes left?
- //
- if (!skc_subgroup_any(is_active))
- break;
-
- //
-	      // If there are active lanes that couldn't be appended to a
-	      // bin because their hashes collided with the bin's
-	      // current ryx key, then those bins must be ejected.
- //
- // Note that we do not eject "full" bins because lazily
- // waiting for a collision results in simpler code.
- //
- skc_flush(bp_atomics,
- bp_elems,
- bp_ids,
- bp_mask,
- cohort_atomics,
- subblocks,
- blocks,
- blocks_next,
- sk_v,
- sk_v_next,
- sk_extent,
- smem,
- hash,
- yx,
- is_active);
- }
- }
- }
-}
-
-//
-// INITIALIZE SMEM
-//
-// Note that SIMD/SIMT have nearly the same syntax.
-//
-static
-void
-skc_smem_init(__local struct skc_subgroup_smem volatile * const smem)
-{
- //
- // initialize smem bins
- //
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
- //
- // SIMD
- //
- smem->bin.vN.yx = ( SKC_RASTERIZE_YX_INIT );
- smem->bin.vN.count = ( 0 );
-#else
- //
- // SIMT
- //
- int idx = skc_subgroup_lane();
-
-#if ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT < SKC_RASTERIZE_ELEMS_PER_SUBGROUP )
- if (idx < SKC_RASTERIZE_TILE_HASH_BIN_COUNT)
-#elif ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT > SKC_RASTERIZE_ELEMS_PER_SUBGROUP )
- for (; idx<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; idx+=SKC_RASTERIZE_SUBGROUP_SIZE)
-#endif
- {
- smem->bin.aN.yx [idx] = ( SKC_RASTERIZE_YX_INIT );
- smem->bin.aN.count[idx] = ( 0 );
- }
-#endif
-}
-
-//
-// RASTERIZE CUBIC KERNEL
-//
-
-static
-void
-skc_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
- __global union skc_bp_elem * const bp_elems,
- __global uint * const bp_ids,
- skc_uint const bp_mask,
-
- __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
- __global skc_ttsk_s_t * const sk_extent,
-
- __local struct skc_subgroup_smem volatile * const smem,
-
- skc_uint * const nodeword,
- skc_block_id_t * const id,
-
- union skc_transform const * const tv,
- union skc_path_clip const * const cv,
- skc_uint const cohort)
-{
- //
- // the initial segment idx and segments-per-block constant determine
- // how many block ids will need to be loaded
- //
- SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
- skc_segment_next(bp_elems,nodeword,id);
-
- SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
- skc_segment_next(bp_elems,nodeword,id);
-
- SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
- skc_segment_next(bp_elems,nodeword,id);
-
- SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
- skc_segment_next(bp_elems,nodeword,id);
-
- SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
- skc_segment_next(bp_elems,nodeword,id);
-
- SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
- skc_segment_next(bp_elems,nodeword,id);
-
- SKC_RASTERIZE_FLOAT const c3x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
- skc_segment_next(bp_elems,nodeword,id);
-
- SKC_RASTERIZE_FLOAT const c3y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
- //
- // apply transform
- //
-  // note that only the end points need to be rounded to subpixel precision
- //
-  // FIXME -- transformation is currently affine-only -- support perspective later
- //
- // the affine transformation requires 8 FMA + 2 ROUND operations
- //
- SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx);
- SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty);
-
- SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx + c1y * tv->shx + tv->tx;
- SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy + tv->ty;
-
- SKC_RASTERIZE_FLOAT const t2x = c2x * tv->sx + c2y * tv->shx + tv->tx;
- SKC_RASTERIZE_FLOAT const t2y = c2x * tv->shy + c2y * tv->sy + tv->ty;
-
- SKC_RASTERIZE_FLOAT const t3x = round(c3x * tv->sx + c3y * tv->shx + tv->tx);
- SKC_RASTERIZE_FLOAT const t3y = round(c3x * tv->shy + c3y * tv->sy + tv->ty);
-
- //
- //
- //
-#if PRINTF_ENABLE
-
-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) \
- printf("{ { %.02f, %.02f }, { %.02f, %.02f }," \
- " { %.02f, %.02f }, { %.02f, %.02f } },\n", \
- b0x C,b0y C,t1x C,t1y C, \
- t2x C,t2y C,t3x C,t3y C);
-
- SKC_RASTERIZE_VECTOR_EXPAND();
-
-#else
-
- printf("{ { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f } },\n",
- b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);
-
-#endif
-
-#endif
-
- //
- // OLD APPROACH
- // ------------
- //
- // The Spinel CUDA rasterizer was significantly more complex and
- // performed a few different tasks that are probably best kept
- // separate.
- //
- // The Spinel rasterizer Bezier held 4-element x and y coordinates
- // in adjacent lanes. This simplified intermingling of single lane
- // 4-coordinate line segments with two-lane cubic Beziers.
- //
- // After transformation of the input segments, the Spinel rasterizer
- // would test cubics for flatness and, if flat, collapse the
- // adjacent lanes into a single line lane and an empty lane.
- //
- // Any lines would then be appended to a line queue.
- //
- // Any cubics would then be subdivided.
- //
- // The reclassification process would be repeated.
- //
- // NEW APPROACH
- // ------------
- //
- // Assume we're only working with cubics in this kernel.
- //
-  // Optimization: if the curve is a special case -- a cusp, 1+
-  // inflections, or a loop -- it might be beneficial to subdivide
-  // the control cage 1+ times in order to separate the flatter
-  // segments from the high-velocity region(s).
- //
- // This means we want to split using [a,b] formulation to _directly_
- // subdivide producing a new control cage.
- //
- // Wang's Formula is still useful even if we subdivide once or twice
- // as it's so cheap that it might give some useful hints about where
- // the high-velocity sections of curve reside.
- //
- // But it seems like using Wang's and directly flattening to line
- // segments without any subdivision is good enough for the limited
- // set of test cases that I've tried.
- //
- // So... use Wang's Formula to estimate how many line segment are
- // required to properly flatten the cubics.
- //
- // Then use inclusive/exclusive scans to put all the lanes to work:
- //
- // 1. segmenting cubics to line segments
- //
- // 2. slivering line segments into 1-pixel high line segments
- //
- // 3. slivering 1-pixel high line segments into 1-pixel wide line
- // segments
- //
- // MORE BACKGROUND ON NEW APPROACH
- // -------------------------------
- //
- // Two options for handling line segments:
- //
- // 1. append the line segments onto an SLM array until enough
- // work has been accrued (Spinel does this)
- //
- // 2. immediately sliver the potentially multi-pixel line
- // segments into subpixel lines
- //
-  // The advantage of (1) is that the slivering process will almost
-  // always be emitting a full subgroup of subpixel lines.
- //
- // The advantage of (2) is that it reduces code complexity and
- // leaves more room for SLM tile bins. The difference between Spinel
- // and Skia Compute is that Wang's Formula guarantees there will be
- // a full subgroup of multi-pixel lines unless this is the final
- // iteration of the warp of multi-pixel lines.
- //
- // Note that wider GPU architectures might benefit from (1) and
- // other work accumulation strategies because it will minimize
- // partial warp workloads in the final iteration of each stage. It
- // also minimizes the sunk cost of the uniform control logic steps.
- //
- // So let's implement (2) for now...
- //
-
- //
- // And... begin!
- //
-  // Estimate how many line segments are in the quad/cubic curve.
- //
- // Wang's Formula will return zero if the control points are
- // collinear but we bump it up to 1.0f.
- //
- SKC_RASTERIZE_FLOAT const s_segs = skc_wangs_formula_cubic(b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);
-
- //
- // if there are free registers then precalculate the reciprocal for
- // each estimated segments since it will never change
- //
- SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);
-
-
- //
- // inclusive add scan of estimated line segments
- // exclusive add scan of estimated line segments
- // total number of estimated line segments
- //
- SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs);
- SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs;
- float s_rem = skc_subgroup_last_float(s_iss); // scalar
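
   [editor's note] A concrete (hypothetical) picture of these three values on a
   4-lane subgroup:

   /*
    *   s_segs = { 3, 1, 4, 2 }    per-lane segment estimates
    *   s_iss  = { 3, 4, 8, 10 }   inclusive add scan
    *   s_ess  = { 0, 3, 4,  8 }   exclusive add scan (s_iss - s_segs)
    *   s_rem  = 10                last lane of the inclusive scan
    *
    * so the loop below emits 10 line segments in ceil(10/4) = 3 passes,
    * retiring one subgroup's worth of segments per pass.
    */
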
-
- //
- // Precompute cubic polynomial coefficients from transformed control
- // cage so we can shuffle them in on each iteration of the outer
- // loop and then evaluate the polynomial in Horner form.
- //
- // | 1 0 0 0 | | c0 |
- // | | | |
- // | -3 3 0 0 | | c1 |
- // B(t) = [ 1 t^1 t^2 t^3 ] | | | |
- // | 3 -6 3 0 | | c2 |
- // | | | |
- // | -1 3 -3 1 | | c3 |
- //
- //
- SKC_RASTERIZE_FLOAT const b1x = mad(-3.0f,b0x,3.0f*t1x); // 2 - 1 MAD + MUL
- SKC_RASTERIZE_FLOAT const b1y = mad(-3.0f,b0y,3.0f*t1y); // 2 - 1 MAD + MUL
-
- SKC_RASTERIZE_FLOAT const b2x = mad(3.0f,b0x,mad(-6.0f,t1x,3.0f*t2x)); // 3 - 2 MAD + MUL
- SKC_RASTERIZE_FLOAT const b2y = mad(3.0f,b0y,mad(-6.0f,t1y,3.0f*t2y)); // 3 - 2 MAD + MUL
-
- SKC_RASTERIZE_FLOAT const b3x = mad(3.0f,t1x,mad(-3.0f,t2x,t3x)) - b0x; // 3 - 2 MAD + SUB
- SKC_RASTERIZE_FLOAT const b3y = mad(3.0f,t1y,mad(-3.0f,t2y,t3y)) - b0y; // 3 - 2 MAD + SUB
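
   [editor's note] The mads above simply fold the Bernstein-to-monomial matrix
   from the comment into per-lane power-basis coefficients. A scalar sketch of
   that setup plus the 3-fma Horner step used later in the loop (one coordinate
   shown; y is identical; names are illustrative):

   /* monomial (power-basis) coefficients of a cubic Bezier, one coordinate */
   typedef struct { float b0, b1, b2, b3; } cubic_poly;

   static cubic_poly
   cubic_to_monomial(float const p0, float const p1, float const p2, float const p3)
   {
     cubic_poly c;

     c.b0 =         p0;                               /* row [  1  0  0  0 ] */
     c.b1 = -3.0f * p0 + 3.0f * p1;                   /* row [ -3  3  0  0 ] */
     c.b2 =  3.0f * p0 - 6.0f * p1 + 3.0f * p2;       /* row [  3 -6  3  0 ] */
     c.b3 =        -p0 + 3.0f * p1 - 3.0f * p2 + p3;  /* row [ -1  3 -3  1 ] */

     return c;
   }

   static float
   cubic_eval(cubic_poly const c, float const t)      /* Horner form: 3 fma */
   {
     /* sanity: cubic_eval(c,0.0f) == p0 and cubic_eval(c,1.0f) == p3 */
     return c.b0 + t * (c.b1 + t * (c.b2 + t * c.b3));
   }
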
-
- //
- // these values don't matter on the first iteration
- //
- SKC_RASTERIZE_FLOAT l1x_prev = 0;
- SKC_RASTERIZE_FLOAT l1y_prev = 0;
-
- //
- // allocate and init in-register TTSK keys
- //
- skc_uint sk_v_next = 0;
- skc_ttsk_v_t sk_v;
-
- sk_v.hi = cohort;
-
- //
- // initialize smem
- //
- skc_smem_init(smem);
-
- //
- // initialize blocks / subblocks
- //
- skc_block_id_v_t blocks;
- skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
-
-#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
- skc_block_id_t subblocks = 0;
-#endif
-
- //
- // loop until done
- //
- while (s_rem > 0)
- {
- //
- // distribute work across lanes
- //
- SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);
-
- //
- // every lane has a fraction to work off of
- //
- // FIXME -- this gets expanded on SIMD
- //
-      // if delta == 1 then this lane emits the first segment of its source
-      // if delta == count then this lane emits the last segment of its source
- //
- SKC_RASTERIZE_FLOAT const s_delta = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
- SKC_RASTERIZE_FLOAT const s_count = skc_subgroup_shuffle(s_segs,s_source);
-
- SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
- SKC_RASTERIZE_PREDICATE const is_s_last = (s_delta >= s_count);
-
- //
- // init parametric t
- //
- SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?
-
- //
- // if last then override to a hard 1.0f
- //
- s_t = is_s_last ? 1.0f : s_t;
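
      [editor's note] Putting the scans, skc_scatter_scan_max() and the t setup
      together: below is a scalar model of one distribution pass, assuming an
      n-lane subgroup with one element per lane (the SIMD path vector-expands
      this). The helper is illustrative, not the kernel's implementation; lanes
      that run past the remaining total clamp to t == 1 and yield zero-length
      segments, which the downstream cull of horizontal/zero-length lines can
      discard.

      /* one pass of lane distribution: map lane -> (source curve, parametric t) */
      static void
      distribute_pass(int   const n,        /* lanes per subgroup                 */
                      float const segs[],   /* per-source segment estimates       */
                      float const ess [],   /* exclusive scan of segs (this pass) */
                      int         source[], /* out: source lane per work lane     */
                      float       t     []) /* out: parametric t per work lane    */
      {
        for (int lane = 0; lane < n; lane++)
          {
            float const offset = (float)(lane + 1);    /* skc_delta_offset()      */

            /* source = highest lane whose exclusive prefix lies below our offset */
            int src = 0;

            for (int ii = 0; ii < n; ii++)
              if (ess[ii] < offset)
                src = ii;

            float const delta = offset - ess[src];     /* 1 .. count              */
            float const count = segs[src];

            source[lane] = src;
            t     [lane] = (delta >= count) ? 1.0f : delta / count;
          }
      }
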
-
- //
- // decrement by subgroup size
- //
- s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
- s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
- s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
-
- //
- // now every lane knows what to do and the following lines will
- // pump out up to SUBGROUP_SIZE line segments
- //
-      // obtain the src vertices through shared memory or via a shuffle
- //
-
- //
-      // shuffle in the polynomial coefficients from their source lane
- //
- SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
- SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);
-
- SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
- SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);
-
- SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
- SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);
-
- SKC_RASTERIZE_FLOAT const s3x = skc_subgroup_shuffle(b3x,s_source);
- SKC_RASTERIZE_FLOAT const s3y = skc_subgroup_shuffle(b3y,s_source);
-
- //
- // calculate "right" line segment endpoint using Horner form
- //
- SKC_RASTERIZE_FLOAT l1x = round(mad(mad(mad(s3x,s_t,s2x),s_t,s1x),s_t,s0x)); // 3 MAD + ROUND
- SKC_RASTERIZE_FLOAT l1y = round(mad(mad(mad(s3y,s_t,s2y),s_t,s1y),s_t,s0y)); // 3 MAD + ROUND
-
- //
- // shuffle up "left" line segment endpoint
- //
- // NOTE: Intel's shuffle_up is unique with its elegant
- // "previous" argument so don't get used to it
- //
- SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
- SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);
-
- //
- // save previous right endpoint
- //
- l1x_prev = l1x;
- l1y_prev = l1y;
-
- //
- // override shuffle up if this is the first line segment
- //
- l0x = select(l0x,s0x,is_s_first);
- l0y = select(l0y,s0y,is_s_first);
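
      [editor's note] The shuffle-up/select pair is just chaining consecutive
      right endpoints into (left,right) pairs. A scalar model with illustrative
      names (one coordinate):

      /* left endpoints for one pass: lane L's left endpoint is lane L-1's right
       * endpoint, lane 0 takes the carry-out of the previous pass, and the first
       * segment of a curve snaps back to that curve's start point; on the very
       * first pass lane 0 is always a first segment, so the bogus carry-in is
       * overridden by the select                                                */
      static void
      chain_left_endpoints(int   const n,
                           float const l1[],       /* right endpoints, this pass   */
                           float const start[],    /* source curve start, per lane */
                           int   const is_first[], /* first segment of its source? */
                           float const l1_prev,    /* previous pass's last l1      */
                           float       l0[])       /* out: left endpoints          */
      {
        for (int lane = 0; lane < n; lane++)
          {
            float const up = (lane == 0) ? l1_prev : l1[lane - 1]; /* shuffle_up_1 */

            l0[lane] = is_first[lane] ? start[lane] : up;          /* select(...)  */
          }
      }
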
-
- //
- // sliver lines
- //
- skc_sliver(bp_atomics,
- bp_elems,
- bp_ids,
- bp_mask,
- cohort_atomics,
- &subblocks,
- &blocks,
- &blocks_next,
- &sk_v,
- &sk_v_next,
- sk_extent,
- smem,
- l0x,l0y,l1x,l1y);
- }
-
- //
- // - flush work-in-progress blocks
- // - return unused block ids
- //
- skc_finalize(bp_atomics,
- bp_elems,
- bp_ids,
- bp_mask,
- cohort_atomics,
- &blocks,
- blocks_next,
- &sk_v,
- sk_v_next,
- sk_extent,
- smem);
-}
-
-//
-// RASTERIZE QUAD KERNEL
-//
-
-static
-void
-skc_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
- __global union skc_bp_elem * const bp_elems,
- __global uint * const bp_ids,
- skc_uint const bp_mask,
-
- __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
- __global skc_ttsk_s_t * const sk_extent,
-
- __local struct skc_subgroup_smem volatile * const smem,
-
- skc_uint * const nodeword,
- skc_block_id_t * const id,
-
- union skc_transform const * const tv,
- union skc_path_clip const * const cv,
- skc_uint const cohort)
-{
- //
- // the initial segment idx and segments-per-block constant determine
- // how many block ids will need to be loaded
- //
- SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
- skc_segment_next(bp_elems,nodeword,id);
-
- SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
- skc_segment_next(bp_elems,nodeword,id);
-
- SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
- skc_segment_next(bp_elems,nodeword,id);
-
- SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
- skc_segment_next(bp_elems,nodeword,id);
-
- SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
- skc_segment_next(bp_elems,nodeword,id);
-
- SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
- //
- // apply transform
- //
-  // note that we only care that the end points are rounded to subpixel precision
-  //
-  // FIXME -- transformation is currently affine-only -- support perspective later
-  //
-  // the affine transformation requires 12 FMA + 4 ROUND operations
- //
- SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx);
- SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty);
-
- SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx + c1y * tv->shx + tv->tx;
- SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy + tv->ty;
-
- SKC_RASTERIZE_FLOAT const t2x = round(c2x * tv->sx + c2y * tv->shx + tv->tx);
- SKC_RASTERIZE_FLOAT const t2y = round(c2x * tv->shy + c2y * tv->sy + tv->ty);
-
- //
-  // Estimate how many line segments are in the quad/cubic curve.
- //
- // Wang's Formula will return zero if the control points are
- // collinear but we bump it up to 1.0f.
- //
- SKC_RASTERIZE_FLOAT const s_segs = skc_wangs_formula_quadratic(b0x,b0y,t1x,t1y,t2x,t2y);
-
- //
-  // if there are free registers then precalculate the reciprocal of
-  // each lane's estimated segment count since it will never change
- //
- SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);
-
-
- //
- // inclusive add scan of estimated line segments
- // exclusive add scan of estimated line segments
- // total number of estimated line segments
- //
- SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs);
- SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs;
- float s_rem = skc_subgroup_last_float(s_iss); // scalar
-
- //
- // Precompute quadratic polynomial coefficients from control cage so
- // we can shuffle them in on each iteration of the outer loop and
- // then evaluate the polynomial in Horner form.
- //
-
- // | 1 0 0 | | c0 |
- // | | | |
- // B(t) = [ 1 t^1 t^2 ] | -2 2 0 | | c1 |
- // | | | |
- // | 1 -2 1 | | c2 |
- //
- //
- SKC_RASTERIZE_FLOAT const b1x = mad(-2.0f,b0x,2.0f*t1x); // 2 - 1 MAD + MUL
- SKC_RASTERIZE_FLOAT const b1y = mad(-2.0f,b0y,2.0f*t1y); // 2 - 1 MAD + MUL
-
- SKC_RASTERIZE_FLOAT const b2x = mad(-2.0f,t1x,b0x+t2x); // 2 - 1 MAD + ADD
- SKC_RASTERIZE_FLOAT const b2y = mad(-2.0f,t1y,b0y+t2y); // 2 - 1 MAD + ADD
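
   [editor's note] Same idea as the cubic case, one degree lower. A scalar
   check of the quadratic coefficients and the 2-fma Horner step used later
   in the loop (one coordinate; illustrative names):

   /* quadratic Bezier in monomial form plus its Horner evaluation */
   static float
   quad_eval(float const p0, float const p1, float const p2, float const t)
   {
     float const b1 = -2.0f * p0 + 2.0f * p1;        /* row [ -2  2  0 ] */
     float const b2 =          p0 - 2.0f * p1 + p2;  /* row [  1 -2  1 ] */

     return p0 + t * (b1 + t * b2);                  /* B(0)=p0, B(1)=p2 */
   }
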
-
- //
- // these values don't matter on the first iteration
- //
- SKC_RASTERIZE_FLOAT l1x_prev = 0;
- SKC_RASTERIZE_FLOAT l1y_prev = 0;
-
- //
- // allocate and init in-register TTSK keys
- //
- skc_uint sk_v_next = 0;
- skc_ttsk_v_t sk_v;
-
- sk_v.hi = cohort;
-
- //
- // initialize smem
- //
- skc_smem_init(smem);
-
- //
- // initialize blocks / subblocks
- //
- skc_block_id_v_t blocks;
- skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
-
-#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
- skc_block_id_t subblocks = 0;
-#endif
-
- //
- // loop until done
- //
- while (s_rem > 0)
- {
- //
- // distribute work across lanes
- //
- SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);
-
- //
- // every lane has a fraction to work off of
- //
- // FIXME -- this gets expanded on SIMD
- //
-      // if delta == 1 then this lane emits the first segment of its source
-      // if delta == count then this lane emits the last segment of its source
- //
- SKC_RASTERIZE_FLOAT const s_delta = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
- SKC_RASTERIZE_FLOAT const s_count = skc_subgroup_shuffle(s_segs,s_source);
-
- SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
- SKC_RASTERIZE_PREDICATE const is_s_last = (s_delta >= s_count);
-
- //
- // init parametric t
- //
- SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?
-
- //
- // if last then override to a hard 1.0f
- //
- s_t = is_s_last ? 1.0f : s_t;
-
- //
- // decrement by subgroup size
- //
- s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
- s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
- s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
-
- //
- // now every lane knows what to do and the following lines will
- // pump out up to SUBGROUP_SIZE line segments
- //
-      // obtain the src vertices through shared memory or via a shuffle
- //
-
- //
-      // shuffle in the polynomial coefficients from their source lane
- //
- SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
- SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);
-
- SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
- SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);
-
- SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
- SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);
-
- //
- // calculate "right" line segment endpoint using Horner form
- //
- SKC_RASTERIZE_FLOAT l1x = round(mad(mad(s2x,s_t,s1x),s_t,s0x)); // 2 MAD + ROUND
- SKC_RASTERIZE_FLOAT l1y = round(mad(mad(s2y,s_t,s1y),s_t,s0y)); // 2 MAD + ROUND
-
- //
- // shuffle up "left" line segment endpoint
- //
- // NOTE: Intel's shuffle_up is unique with its elegant
- // "previous" argument so don't get used to it
- //
- SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
- SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);
-
- //
- // save previous right endpoint
- //
- l1x_prev = l1x;
- l1y_prev = l1y;
-
- //
- // override shuffle up if this is the first line segment
- //
- l0x = select(l0x,s0x,is_s_first);
- l0y = select(l0y,s0y,is_s_first);
-
- //
- // sliver lines
- //
- skc_sliver(bp_atomics,
- bp_elems,
- bp_ids,
- bp_mask,
- cohort_atomics,
- &subblocks,
- &blocks,
- &blocks_next,
- &sk_v,
- &sk_v_next,
- sk_extent,
- smem,
- l0x,l0y,l1x,l1y);
- }
-
- //
- // - flush work-in-progress blocks
- // - return unused block ids
- //
- skc_finalize(bp_atomics,
- bp_elems,
- bp_ids,
- bp_mask,
- cohort_atomics,
- &blocks,
- blocks_next,
- &sk_v,
- sk_v_next,
- sk_extent,
- smem);
-}
-
-//
-// RASTERIZE LINE KERNEL
-//
-
-static
-void
-skc_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
- __global union skc_bp_elem * const bp_elems,
- __global uint * const bp_ids,
- skc_uint const bp_mask,
-
- __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
- __global skc_ttsk_s_t * const sk_extent,
-
- __local struct skc_subgroup_smem volatile * const smem,
-
- skc_uint * const nodeword,
- skc_block_id_t * const id,
-
- union skc_transform const * const tv,
- union skc_path_clip const * const cv,
- skc_uint const cohort)
-{
- //
- // the initial segment idx and segments-per-block constant determine
- // how many block ids will need to be loaded
- //
- SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
- skc_segment_next(bp_elems,nodeword,id);
-
- SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
- skc_segment_next(bp_elems,nodeword,id);
-
- SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
- skc_segment_next(bp_elems,nodeword,id);
-
- SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
-
-#if 0
- // printf("%5u : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",(skc_uint)get_global_id(0),c0x,c0y,c1x,c1y);
- printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",c0x,c0y,c1x,c1y);
-#endif
-
- //
- // apply transform
- //
-  // note that we only care that the end points are rounded to subpixel precision
- //
- // FIXME -- transformation is currently affine-only
- // FIXME -- support perspective later
- //
- // the affine transformation requires 8 FMA + 4 ROUND operations
- //
- SKC_RASTERIZE_FLOAT const l0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx);
- SKC_RASTERIZE_FLOAT const l0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty);
-
- SKC_RASTERIZE_FLOAT const l1x = round(c1x * tv->sx + c1y * tv->shx + tv->tx);
- SKC_RASTERIZE_FLOAT const l1y = round(c1x * tv->shy + c1y * tv->sy + tv->ty);
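
   [editor's note] A scalar sketch of the affine step above, in plain C; the
   struct mirrors the first six floats of the transform and assumes the host
   has already folded the subpixel scale in (see the transform note further
   below):

   #include <math.h>

   typedef struct { float sx, shx, tx, shy, sy, ty; } affine2x3;

   /* 2 fma + 1 round per coordinate -> 8 FMA + 4 ROUND for a line's two points */
   static void
   affine_apply_round(affine2x3 const * const tv,
                      float const x, float const y,
                      float * const ox, float * const oy)
   {
     *ox = roundf(x * tv->sx  + y * tv->shx + tv->tx);
     *oy = roundf(x * tv->shy + y * tv->sy  + tv->ty);
   }
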
-
-#if 0
- printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",l0x,l0y,l1x,l1y);
-#endif
-
- //
- // allocate and init in-register TTSK keys
- //
- skc_uint sk_v_next = 0;
- skc_ttsk_v_t sk_v;
-
- sk_v.hi = cohort;
-
- //
- // initialize smem
- //
- skc_smem_init(smem);
-
- //
- // initialize blocks / subblocks
- //
- skc_block_id_v_t blocks;
- skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
-
-#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
- skc_block_id_t subblocks = 0;
-#endif
-
- //
- // sliver lines
- //
- skc_sliver(bp_atomics,
- bp_elems,
- bp_ids,
- bp_mask,
- cohort_atomics,
- &subblocks,
- &blocks,
- &blocks_next,
- &sk_v,
- &sk_v_next,
- sk_extent,
- smem,
- l0x,l0y,l1x,l1y);
-
- //
- // - flush work-in-progress blocks
- // - return unused block ids
- //
- skc_finalize(bp_atomics,
- bp_elems,
- bp_ids,
- bp_mask,
- cohort_atomics,
- &blocks,
- blocks_next,
- &sk_v,
- sk_v_next,
- sk_extent,
- smem);
-}
-
-//
-//
-//
-
-__kernel
-SKC_RASTERIZE_KERNEL_ATTRIBS
-void
-skc_kernel_rasterize_all(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
- __global union skc_bp_elem * const bp_elems,
- __global uint * const bp_ids,
- skc_uint const bp_mask,
-
- __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
- __global skc_ttsk_s_t * const sk_extent,
-
- __global float8 const * const transforms, // FIXME -- __constant
- __global float4 const * const clips, // FIXME -- __constant
- __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant
- skc_uint const count)
-{
- //
- // declare shared memory block
- //
-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
- __local struct skc_subgroup_smem volatile smem[1];
-#else
- __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
- __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
-#endif
-
- //
- // this is a subgroup/warp-centric kernel
- //
- // which subgroup in the grid is this?
- //
-  // TAKE NOTE: the Intel GEN compiler appears to recognize
-  // get_group_id(0) as a uniform, but the alternative calculation used
-  // when there are multiple subgroups per workgroup is not
-  // cooperating and is driving register spillage elsewhere.
- //
-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
- uint const cmd_idx = get_group_id(0);
-#else
- uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
-#endif
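
[editor's note] The mapping is simply one rasterization command per subgroup;
an illustrative helper (hypothetical name, same arithmetic as the expression
above):

/* command index for (workgroup, subgroup): e.g. workgroup 3 with 4 subgroups
 * per workgroup and subgroup id 2 -> command 3 * 4 + 2 = 14                 */
static unsigned int
rasterize_cmd_idx(unsigned int const group_id,
                  unsigned int const subgroups_per_workgroup,
                  unsigned int const sub_group_id)
{
  return group_id * subgroups_per_workgroup + sub_group_id;
}
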
-
-#if 0
- if (get_sub_group_local_id() == 0)
- printf("+cmd_idx = %u\n",cmd_idx);
-#endif
-
- //
-  // if workgroups are multi-subgroup then there may be excess
- // subgroups in the final workgroup
- //
- if (cmd_idx >= count)
- return;
-
-#if 0
- if (get_sub_group_local_id() == 0)
- printf("-cmd_idx = %u\n",cmd_idx);
-#endif
-
- //
- // load a single command for this subgroup
- //
- union skc_cmd_rasterize const cmd = cmds[cmd_idx];
-
-#if 0
- if (get_sub_group_local_id() == 0)
- printf("[ %u ]< %u, %u, %u, %u >\n",
- cmd_idx,
- cmd.nodeword,
- SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd),
- SKC_CMD_RASTERIZE_GET_CLIP(cmd),
- SKC_CMD_RASTERIZE_GET_COHORT(cmd));
-#endif
-
- //
- // get first block node command word and its subblock
- //
- skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing
- skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id;
- skc_block_id_tag tag = SKC_TAGGED_BLOCK_ID_GET_TAG(tag_id);
- skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
-
- //
- // load transform -- uniform across subgroup
- //
- // v8: { sx shx tx shy sy ty w0 w1 }
- //
- // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
- //
- // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
- //
-  // Coordinates are scaled to subpixel resolution. All that matters
-  // is that continuity is maintained between path element endpoints.
-  //
-  // It's the responsibility of the host to ensure that the transforms
-  // are properly scaled, either by initializing a transform stack
-  // with the subpixel-resolution-scaled identity or by scaling the
-  // transform before it's loaded by a rasterization grid.
- //
- // FIXME -- horizontal load might be better than this broadcast load
- //
- union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
- union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load
- skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
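
  [editor's note] One way the host-side pre-scaling described above might look,
  as a sketch under the assumption that the subpixel scale is applied on the
  left (i.e. as the root of the transform stack); resl_x/resl_y stand in for
  SKC_SUBPIXEL_RESL_X_F32 / SKC_SUBPIXEL_RESL_Y_F32 and the names are
  illustrative:

  typedef struct { float sx, shx, tx, shy, sy, ty; } transform2x3;

  /* fold a diag(resl_x, resl_y) subpixel scale into a user transform so the
   * rasterizer receives coordinates already in subpixel units              */
  static transform2x3
  transform_scale_to_subpixel(transform2x3 t, float const resl_x, float const resl_y)
  {
    t.sx  *= resl_x;  t.shx *= resl_x;  t.tx *= resl_x;
    t.shy *= resl_y;  t.sy  *= resl_y;  t.ty *= resl_y;

    return t;
  }
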
-
- switch (tag)
- {
- case SKC_BLOCK_ID_TAG_PATH_LINE:
- skc_rasterize_lines(bp_atomics,
- bp_elems,
- bp_ids,
- bp_mask,
- cohort_atomics,
- sk_extent,
- smem,
- &nodeword,&id,
- &tv,&cv,cohort);
- break;
-
- case SKC_BLOCK_ID_TAG_PATH_QUAD:
- skc_rasterize_quads(bp_atomics,
- bp_elems,
- bp_ids,
- bp_mask,
- cohort_atomics,
- sk_extent,
- smem,
- &nodeword,&id,
- &tv,&cv,cohort);
- break;
-
- case SKC_BLOCK_ID_TAG_PATH_CUBIC:
- skc_rasterize_cubics(bp_atomics,
- bp_elems,
- bp_ids,
- bp_mask,
- cohort_atomics,
- sk_extent,
- smem,
- &nodeword,&id,
- &tv,&cv,cohort);
- break;
-
- case SKC_BLOCK_ID_TAG_PATH_RAT_QUAD:
- break;
- case SKC_BLOCK_ID_TAG_PATH_RAT_CUBIC:
- break;
-
- default:
- break;
- }
-}
-
-//
-//
-//
-
-__kernel
-SKC_RASTERIZE_KERNEL_ATTRIBS
-void
-skc_kernel_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
- __global union skc_bp_elem * const bp_elems,
- __global uint * const bp_ids,
- skc_uint const bp_mask,
-
- __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
- __global skc_ttsk_s_t * const sk_extent,
-
- __global float8 const * const transforms, // FIXME -- __constant
- __global float4 const * const clips, // FIXME -- __constant
- __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant
- skc_uint const count)
-{
- //
- // declare shared memory block
- //
-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
- __local struct skc_subgroup_smem volatile smem[1];
-#else
- __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
- __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
-#endif
-
- //
- // this is a subgroup/warp-centric kernel
- //
- // which subgroup in the grid is this?
- //
-  // TAKE NOTE: the Intel GEN compiler appears to recognize
-  // get_group_id(0) as a uniform, but the alternative calculation used
-  // when there are multiple subgroups per workgroup is not
-  // cooperating and is driving register spillage elsewhere.
- //
-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
- uint const cmd_idx = get_group_id(0);
-#else
- uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
-#endif
-
- //
-  // if workgroups are multi-subgroup then there may be excess
- // subgroups in the final workgroup
- //
- if (cmd_idx >= count)
- return;
-
-#if 0
- if (get_sub_group_local_id() == 0)
- printf("cmd_idx = %u\n",cmd_idx);
-#endif
-
- //
- // load a single command for this subgroup
- //
- union skc_cmd_rasterize const cmd = cmds[cmd_idx];
-
- //
- // get first block node command word and its subblock
- //
- skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing
- skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id;
- skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
-
- //
- // load transform -- uniform across subgroup
- //
- // v8: { sx shx tx shy sy ty w0 w1 }
- //
- // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
- //
- // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
- //
-  // Coordinates are scaled to subpixel resolution. All that matters
-  // is that continuity is maintained between path element endpoints.
-  //
-  // It's the responsibility of the host to ensure that the transforms
-  // are properly scaled, either by initializing a transform stack
-  // with the subpixel-resolution-scaled identity or by scaling the
-  // transform before it's loaded by a rasterization grid.
- //
- // FIXME -- horizontal load might be better than this broadcast load
- //
- union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
- union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load
- skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
-
- skc_rasterize_lines(bp_atomics,
- bp_elems,
- bp_ids,
- bp_mask,
- cohort_atomics,
- sk_extent,
- smem,
- &nodeword,&id,
- &tv,&cv,cohort);
-}
-
-//
-//
-//
-
-//
-//
-//
-
-__kernel
-SKC_RASTERIZE_KERNEL_ATTRIBS
-void
-skc_kernel_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
- __global union skc_bp_elem * const bp_elems,
- __global uint * const bp_ids,
- skc_uint const bp_mask,
-
- __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
- __global skc_ttsk_s_t * const sk_extent,
-
- __global float8 const * const transforms, // FIXME -- __constant
- __global float4 const * const clips, // FIXME -- __constant
- __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant
- skc_uint const count)
-{
- //
- // declare shared memory block
- //
-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
- __local struct skc_subgroup_smem volatile smem[1];
-#else
- __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
- __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
-#endif
-
- //
- // this is a subgroup/warp-centric kernel
- //
- // which subgroup in the grid is this?
- //
-  // TAKE NOTE: the Intel GEN compiler appears to recognize
-  // get_group_id(0) as a uniform, but the alternative calculation used
-  // when there are multiple subgroups per workgroup is not
-  // cooperating and is driving register spillage elsewhere.
- //
-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
- uint const cmd_idx = get_group_id(0);
-#else
- uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
-#endif
-
- //
-  // if workgroups are multi-subgroup then there may be excess
- // subgroups in the final workgroup
- //
- if (cmd_idx >= count)
- return;
-
-#if 0
- if (get_sub_group_local_id() == 0)
- printf("cmd_idx = %u\n",cmd_idx);
-#endif
-
- //
- // load a single command for this subgroup
- //
- union skc_cmd_rasterize const cmd = cmds[cmd_idx];
-
- //
- // get first block node command word and its subblock
- //
- skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing
- skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id;
- skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
-
- //
- // load transform -- uniform across subgroup
- //
- // v8: { sx shx tx shy sy ty w0 w1 }
- //
- // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
- //
- // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
- //
-  // Coordinates are scaled to subpixel resolution. All that matters
-  // is that continuity is maintained between path element endpoints.
-  //
-  // It's the responsibility of the host to ensure that the transforms
-  // are properly scaled, either by initializing a transform stack
-  // with the subpixel-resolution-scaled identity or by scaling the
-  // transform before it's loaded by a rasterization grid.
- //
- // FIXME -- horizontal load might be better than this broadcast load
- //
- union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
- union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load
- skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
-
- skc_rasterize_quads(bp_atomics,
- bp_elems,
- bp_ids,
- bp_mask,
- cohort_atomics,
- sk_extent,
- smem,
- &nodeword,&id,
- &tv,&cv,cohort);
-}
-
-//
-//
-//
-
-__kernel
-SKC_RASTERIZE_KERNEL_ATTRIBS
-void
-skc_kernel_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
- __global union skc_bp_elem * const bp_elems,
- __global uint * const bp_ids,
- skc_uint const bp_mask,
-
- __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
- __global skc_ttsk_s_t * const sk_extent,
-
- __global float8 const * const transforms, // FIXME -- __constant
- __global float4 const * const clips, // FIXME -- __constant
- __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant
- skc_uint const count)
-{
- //
- // declare shared memory block
- //
-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
- __local struct skc_subgroup_smem volatile smem[1];
-#else
- __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
- __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
-#endif
-
- //
- // this is a subgroup/warp-centric kernel
- //
- // which subgroup in the grid is this?
- //
-  // TAKE NOTE: the Intel GEN compiler appears to recognize
-  // get_group_id(0) as a uniform, but the alternative calculation used
-  // when there are multiple subgroups per workgroup is not
-  // cooperating and is driving register spillage elsewhere.
- //
-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
- uint const cmd_idx = get_group_id(0);
-#else
- uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
-#endif
-
- //
-  // if workgroups are multi-subgroup then there may be excess
- // subgroups in the final workgroup
- //
- if (cmd_idx >= count)
- return;
-
-#if 0
- if (get_sub_group_local_id() == 0)
- printf("cmd_idx = %u\n",cmd_idx);
-#endif
-
- //
- // load a single command for this subgroup
- //
- union skc_cmd_rasterize const cmd = cmds[cmd_idx];
-
- //
- // get first block node command word and its subblock
- //
- skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing
- skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id;
- skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
-
- //
- // load transform -- uniform across subgroup
- //
- // v8: { sx shx tx shy sy ty w0 w1 }
- //
- // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
- //
- // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
- //
-  // Coordinates are scaled to subpixel resolution. All that matters
-  // is that continuity is maintained between path element endpoints.
-  //
-  // It's the responsibility of the host to ensure that the transforms
-  // are properly scaled, either by initializing a transform stack
-  // with the subpixel-resolution-scaled identity or by scaling the
-  // transform before it's loaded by a rasterization grid.
- //
- // FIXME -- horizontal load might be better than this broadcast load
- //
- union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
- union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load
- skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
-
- skc_rasterize_cubics(bp_atomics,
- bp_elems,
- bp_ids,
- bp_mask,
- cohort_atomics,
- sk_extent,
- smem,
- &nodeword,&id,
- &tv,&cv,cohort);
-}
-
-//
-//
-//
-
-__kernel
-SKC_RASTERIZE_KERNEL_ATTRIBS
-void
-skc_kernel_rasterize_rat_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
- __global union skc_bp_elem * const bp_elems,
- __global uint * const bp_ids,
- skc_uint const bp_mask,
-
- __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
- __global skc_ttsk_s_t * const sk_extent,
-
- __global float8 const * const transforms, // FIXME -- __constant
- __global float4 const * const clips, // FIXME -- __constant
- __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant
- skc_uint const count)
-{
- ;
-}
-
-//
-//
-//
-
-__kernel
-SKC_RASTERIZE_KERNEL_ATTRIBS
-void
-skc_kernel_rasterize_rat_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
- __global union skc_bp_elem * const bp_elems,
- __global uint * const bp_ids,
- skc_uint const bp_mask,
-
- __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
- __global skc_ttsk_s_t * const sk_extent,
-
- __global float8 const * const transforms, // FIXME -- __constant
- __global float4 const * const clips, // FIXME -- __constant
- __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant
- skc_uint const count)
-{
- ;
-}
-
-//
-//
-//
+/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "tile.h" +#include "common.h" +#include "atomic_cl.h" +#include "block_pool_cl.h" +#include "raster_builder_cl_12.h" +#include "kernel_cl_12.h" + +// #define SKC_ARCH_AVX2 +// #define SKC_RASTERIZE_SIMD_USES_SMEM + +#define PRINTF_ENABLE 0 +#define PRINTF_BLOCK_COUNT 0 + +// +// NOTE: +// +// ON SIMD DEVICES THE BIN COUNT MUST BE POW2 SO THAT WE CAN LOAD IT +// AS A VECTOR AND PERFORM A SWIZZLE/SHUFFLE +// +// NOTE: +// +// IGNORE FOR NOW ANY AVX2 CODE SNIPPETS. THEY WILL BE MOVED ASAP. +// +// + +#if 0 // SKC_ARCH_AVX2 + +// #define SKC_RASTERIZE_SUBGROUP_SIZE 1 +// #define SKC_RASTERIZE_VECTOR_SIZE_LOG2 3 +// #define SKC_RASTERIZE_WORKGROUP_COUNT_SUBGROUP 1 + +// #define SKC_TTXB_WORDS 8 + +// #define SKC_RASTERIZE_FLOAT float8 +// #define SKC_RASTERIZE_UINT uint8 +// #define SKC_RASTERIZE_INT int8 +// #define SKC_RASTERIZE_PREDICATE int8 + +// #define SKC_RASTERIZE_BIN_BLOCK uint16 +// #define SKC_RASTERIZE_BIN uint8 + +// #define SKC_RASTERIZE_POOL uint8 +// #define SKC_RASTERIZE_POOL_SCALE 6 + +// #define SKC_RASTERIZE_TILE_HASH_X_BITS 1 +// #define SKC_RASTERIZE_TILE_HASH_Y_BITS 2 + +// #define SKC_RASTERIZE_VECTOR_EXPAND() SKC_EXPAND_8() + +#endif + +// +// SIMT +// + +#define SKC_RASTERIZE_BLOCK_ID_V_SIZE SKC_RASTERIZE_SUBGROUP_SIZE +#define SKC_RASTERIZE_TTSK_V_SIZE SKC_RASTERIZE_SUBGROUP_SIZE +#define SKC_RASTERIZE_TTSK_V_MASK (SKC_RASTERIZE_TTSK_V_SIZE - 1) + +// +// +// + +#define SKC_RASTERIZE_VECTOR_SIZE (1 << SKC_RASTERIZE_VECTOR_SIZE_LOG2) +#define SKC_RASTERIZE_ELEMS_PER_SUBGROUP (SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_VECTOR_SIZE) + +// +// +// + +#define SKC_RASTERIZE_YX_INIT 0x7FFF7FFF // { +32767, +32767 } +#define SKC_RASTERIZE_YX_INVALID 0x80008000 // { -32768, -32768 } + +// +// +// + +#define SKC_RASTERIZE_TILE_HASH_X_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_X_BITS) +#define SKC_RASTERIZE_TILE_HASH_Y_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_Y_BITS) +#define SKC_RASTERIZE_TILE_HASH_BITS (SKC_RASTERIZE_TILE_HASH_X_BITS + SKC_RASTERIZE_TILE_HASH_Y_BITS) +#define SKC_RASTERIZE_TILE_HASH_BIN_COUNT (1 << SKC_RASTERIZE_TILE_HASH_BITS) +#define SKC_RASTERIZE_TILE_HASH_BIN_BITS (SKC_RASTERIZE_TILE_HASH_BITS + 1) // FIXME -- LOG2_RU(BIN_COUNT) +#define SKC_RASTERIZE_TILE_HASH_BIN_MASK SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_BIN_BITS) + +// +// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++" +// +// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/ +// +// Lerp in two fma/mad ops: +// +// t * b + ((-t) * a + a) +// +// Note: OpenCL documents mix() as being implemented as: +// +// a + (b - a) * t +// +// But this may be a native instruction on some devices. For example, +// on GEN9 there is an LRP "linear interoplation" opcode but it +// doesn't appear to support half floats. +// +// Feel free to toggle this option and then benchmark and inspect the +// generated code. We really want the double FMA to be generated when +// there isn't support for a LERP/MIX operation. +// + +#if 1 +#define SKC_LERP(a,b,t) mad(t,b,mad(-(t),a,a)) +#else +#define SKC_LERP(a,b,t) mix(a,b,t) +#endif + +// +// There is no integer MAD in OpenCL with "don't care" overflow +// semantics. +// +// FIXME -- verify if the platform needs explicit MAD operations even +// if a "--fastmath" option is available at compile time. 
It might +// make sense to explicitly use MAD calls if the platform requires it. +// + +#if 1 +#define SKC_MAD_UINT(a,b,c) ((a) * (b) + (c)) +#else +#define SKC_MAD_UINT(a,b,c) mad_sat(a,b,c) +#endif + +// +// +// + +#define SKC_RASTERIZE_SEGMENT(id) (id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()) + +// +// +// + +union skc_bp_elem +{ + skc_uint u32; + skc_tagged_block_id_t tag_id; + skc_float coord; +}; + +// +// +// + +struct skc_subgroup_smem +{ + // + // SIMT subgroup scratchpad for max scan -- also shared with 'winner' member + // +#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) || defined ( SKC_RASTERIZE_SIMD_USES_SMEM ) + struct { + union { + + skc_uint winner; + + struct { + skc_uint scratch[SKC_RASTERIZE_SUBGROUP_SIZE]; + } aN; + + struct { + SKC_RASTERIZE_UINT scratch[SKC_RASTERIZE_SUBGROUP_SIZE]; + } vN; + }; + } subgroup; +#endif + + // + // work-in-progress TTSB blocks and associated YX keys + // + union { + struct { + // FIXME -- some typedefs are valid here + skc_uint ttsb [SKC_RASTERIZE_TILE_HASH_BIN_COUNT][SKC_DEVICE_SUBBLOCK_WORDS]; + skc_uint yx [SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; + skc_uint id [SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; + skc_uint count[SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; + } aN; +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + struct { + SKC_RASTERIZE_BIN_BLOCK ttsb[SKC_RASTERIZE_TILE_HASH_BIN_COUNT]; + SKC_RASTERIZE_BIN yx; + SKC_RASTERIZE_BIN id; + SKC_RASTERIZE_BIN count; + } vN; +#endif + } bin; +}; + +// +// +// + +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) +#define skc_subgroup_lane() 0 +#else +#define skc_subgroup_lane() get_sub_group_local_id() +#endif + +// +// replenish block ids +// +// note that you can't overrun the block id pool since it's a ring +// + +static +void +skc_blocks_replenish(skc_uint * const blocks_next, + skc_block_id_v_t * const blocks, + __global SKC_ATOMIC_UINT volatile * const bp_atomics, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids) +{ + // + // get a new vector of block ids -- this is kind of a narrow + // allocation but subblocks help stretch out the pool. + // + // FIXME -- there is now plenty of SMEM to allocate a LOT of block ids + // + skc_uint bp_idx = 0; + + if (skc_subgroup_lane() == 0) + { + bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS, + SKC_RASTERIZE_BLOCK_ID_V_SIZE); // ring_reads +#if 0 + printf("r+: %8u + %u\n",bp_idx,SKC_RASTERIZE_BLOCK_ID_V_SIZE); +#endif + } + + bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane()) & bp_mask; + *blocks = bp_ids[bp_idx]; + *blocks_next = 0; +} + +// +// +// + +static +skc_block_id_t +skc_blocks_get_next(skc_uint * const blocks_next, + skc_block_id_v_t * const blocks, + __global SKC_ATOMIC_UINT volatile * const bp_atomics, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids) +{ + // replenish? 
+ if (*blocks_next == SKC_RASTERIZE_BLOCK_ID_V_SIZE) + { + skc_blocks_replenish(blocks_next,blocks,bp_atomics,bp_mask,bp_ids); + } + +#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) + // + // SIMT + // + skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next); + +#else + // + // SIMD + // + skc_block_id_t id = blocks->s0; + + skc_shuffle_down_1(*blocks); + +#endif + + *blocks_next += 1; + + return id; +} + +// +// subblock allocator +// + +#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + +static +skc_block_id_t +skc_subblocks_get_next(skc_block_id_t * const subblocks, + skc_uint * const blocks_next, + skc_block_id_v_t * const blocks, + __global SKC_ATOMIC_UINT volatile * const bp_atomics, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids) +{ + if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) + { + *subblocks = skc_blocks_get_next(blocks_next,blocks,bp_atomics,bp_mask,bp_ids); + } + + skc_block_id_t const sb_id = *subblocks; + + *subblocks += 1; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("= %u\n",sb_id); +#endif + + return sb_id; +} + + +#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const subblocks, skc_block_id_t * const blocks +#define SKC_SUBBLOCKS_BLOCKS_ARGS() subblocks, blocks + +#else + +#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const blocks +#define SKC_SUBBLOCKS_BLOCKS_ARGS() blocks + +#endif + +// +// +// + +static +skc_block_id_t +skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_PROTO(), + skc_uint * const blocks_next, + __global SKC_ATOMIC_UINT volatile * const bp_atomics, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const bp_ids, + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + skc_ttsk_v_t * const sk_v, + skc_uint * const sk_v_next, + __global skc_ttsk_s_t * const sk_extent, + skc_uint const new_yx) +{ +#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + skc_block_id_t const new_id = skc_subblocks_get_next(subblocks, + blocks_next, + blocks, + bp_atomics, + bp_mask, + bp_ids); +#else + skc_block_id_t const new_id = skc_blocks_get_next(blocks_next, + blocks, + bp_atomics, + bp_mask, // pow2 modulo mask for block pool ring + bp_ids); +#endif + + if (get_sub_group_local_id() == (*sk_v_next & SKC_RASTERIZE_TTSK_V_MASK)) + { + sk_v->lo = new_id; + sk_v->hi = (sk_v->hi & SKC_TTRK_HI_MASK_COHORT) | new_yx; +#if 0 + printf("@ ( %3u, %3u ) %u\n", + (new_yx >> 12) & 0xFFF, + (new_yx ) & 0xFFF, + new_id); +#endif + } + + *sk_v_next += 1; + + if (*sk_v_next == SKC_RASTERIZE_TTSK_V_SIZE) + { + *sk_v_next = 0; + + skc_uint sk_idx = 0; + + if (skc_subgroup_lane() == 0) + { + sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE + (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_TTSK_V_SIZE); +#if 0 + printf("+ %u\n",sk_idx); +#endif + } + + sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane(); + +#if ( SKC_RASTERIZE_SUBGROUP_SIZE > SKC_RASTERIZE_TTSK_V_SIZE ) + if (skc_subgroup_lane() < SKC_RASTERIZE_TTSK_V_SIZE) +#endif + { + sk_extent[sk_idx] = *sk_v; +#if 0 + printf("> %u : %v2u\n",sk_idx,*sk_v); +#endif + } + } + + return new_id; +} + +// +// +// + +static +SKC_RASTERIZE_FLOAT +skc_subgroup_scan_inclusive_add_float(SKC_RASTERIZE_FLOAT const v) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + // Note that there isn't a built-in horizontal scan for vectors so + // we'll define some here for various widths. 
+ // + // FIXME -- a scalar version might be faster so put in a + // compile-time switch to selection between implementations + // + +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + // 01 + // 0 + + // -- + // 01 + SKC_RASTERIZE_FLOAT const w = mad(v.s10,(SKC_RASTERIZE_FLOAT)(0,1),v); + return w; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + // 0123 + // 012 + + // ---- + // 0123 + // 01 + + // ---- + // 0123 + // + SKC_RASTERIZE_FLOAT const w = mad(v.s3012,(SKC_RASTERIZE_FLOAT)(0,1,1,1),v); + SKC_RASTERIZE_FLOAT const x = mad(w.s2301,(SKC_RASTERIZE_FLOAT)(0,0,1,1),w); + return x; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + // 01234567 + // 0123456 + + // -------- + // 01234567 + // 012345 + + // -------- + // 01234567 + // 0123 + + // -------- + // 01234567 + // + SKC_RASTERIZE_FLOAT const w = mad(v.s70123456,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1),v); + SKC_RASTERIZE_FLOAT const x = mad(w.s67012345,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1),w); + SKC_RASTERIZE_FLOAT const y = mad(x.s45670123,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1),x); + return y; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + // 0123456789abcdef + // 0123456789abcde + + // ---------------- + // 0123456789abcdef + // 0123456789abcd + + // ---------------- + // 0123456789abcdef + // 0123456789ab + + // ---------------- + // 0123456789abcdef + // 01234567 + + // ---------------- + // 0123456789abcdef + // + SKC_RASTERIZE_FLOAT const w = mad(v.sf0123456789abcde,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v); + SKC_RASTERIZE_FLOAT const x = mad(w.sef0123456789abcd,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w); + SKC_RASTERIZE_FLOAT const y = mad(x.scdef0123456789ab,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x); + SKC_RASTERIZE_FLOAT const z = mad(y.s89abcdef01234567,(SKC_RASTERIZE_FLOAT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y); + return z; + +#endif + +#else + // + // SIMT + // + + return sub_group_scan_inclusive_add(v); + +#endif +} + +// +// +// + +static +SKC_RASTERIZE_UINT +skc_subgroup_scan_inclusive_add_uint(SKC_RASTERIZE_UINT const v) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + // Note that there isn't a built-in horizontal scan for vectors so + // we'll define some here for various widths. 
+ // + // FIXME -- a scalar version might be faster so put in a + // compile-time switch to selection between implementations + // + +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + // 01 + // 0 + + // -- + // 01 + SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s10,(SKC_RASTERIZE_UINT)(0,1),v); + return w; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + // 0123 + // 012 + + // ---- + // 0123 + // 01 + + // ---- + // 0123 + // + SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s3012,(SKC_RASTERIZE_UINT)(0,1,1,1),v); + SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s2301,(SKC_RASTERIZE_UINT)(0,0,1,1),w); + return x; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + // 01234567 + // 0123456 + + // -------- + // 01234567 + // 012345 + + // -------- + // 01234567 + // 0123 + + // -------- + // 01234567 + // + SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s70123456,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1),v); + SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s67012345,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1),w); + SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.s45670123,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1),x); + return y; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + // 0123456789abcdef + // 0123456789abcde + + // ---------------- + // 0123456789abcdef + // 0123456789abcd + + // ---------------- + // 0123456789abcdef + // 0123456789ab + + // ---------------- + // 0123456789abcdef + // 01234567 + + // ---------------- + // 0123456789abcdef + // + SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.sf0123456789abcde,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v); + SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.sef0123456789abcd,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w); + SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.scdef0123456789ab,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x); + SKC_RASTERIZE_UINT const z = SKC_MAD_UINT(y.s89abcdef01234567,(SKC_RASTERIZE_UINT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y); + return z; + +#endif + +#else + // + // SIMT + // + + return sub_group_scan_inclusive_add(v); + +#endif +} + +// +// +// + +static +SKC_RASTERIZE_UINT +skc_subgroup_scan_inclusive_max(SKC_RASTERIZE_UINT const v) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + // Note that there isn't a built-in horizontal scan for vectors so + // we'll define some here for various widths. 
+ // + // FIXME -- a scalar version might be faster so put in a + // compile-time switch to selection between implementations + // + +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + // 01 + // 00 max + // -- + // 01 + SKC_RASTERIZE_UINT const w = max(v.s00,v); + return w; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + // 0123 + // 0012 + + // ---- + // 0123 + // 0101 + + // ---- + // 0123 + // + SKC_RASTERIZE_UINT const w = max(v.s0012,v); + SKC_RASTERIZE_UINT const x = max(w.s0101,w); + return x; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + // 01234567 + // 00123456 + + // -------- + // 01234567 + // 01012345 + + // -------- + // 01234567 + // 01230123 + + // -------- + // 01234567 + // + SKC_RASTERIZE_UINT const w = max(v.s00123456,v); + SKC_RASTERIZE_UINT const x = max(w.s01012345,w); + SKC_RASTERIZE_UINT const y = max(x.s01230123,x); + return y; + +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + // 0123456789abcdef + // 00123456789abcde + + // ---------------- + // 0123456789abcdef + // 010123456789abcd + + // ---------------- + // 0123456789abcdef + // 01230123456789ab + + // ---------------- + // 0123456789abcdef + // 0123456701234567 + + // ---------------- + // 0123456789abcdef + // + SKC_RASTERIZE_UINT const w = max(v.s00123456789abcde,v); + SKC_RASTERIZE_UINT const x = max(w.s010123456789abcd,w); + SKC_RASTERIZE_UINT const y = max(x.s01230123456789ab,x); + SKC_RASTERIZE_UINT const z = max(y.s0123456701234567,y); + return z; + +#endif + +#else + // + // SIMT + // + + return sub_group_scan_inclusive_max(v); + +#endif +} + +// +// +// + +static +float +skc_subgroup_last_float(SKC_RASTERIZE_FLOAT const v) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + return v.s1; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + return v.s3; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + return v.s7; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + return v.sf; +#endif + +#else + // + // SIMT + // + return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1); + +#endif +} + +// +// +// + +static +SKC_RASTERIZE_UINT +skc_subgroup_last_uint(SKC_RASTERIZE_UINT const v) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + return v.s1; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + return v.s3; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + return v.s7; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + return v.sf; +#endif + +#else + // + // SIMT + // + return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1); + +#endif +} + +// +// +// + +static +float +skc_subgroup_first(SKC_RASTERIZE_FLOAT const v) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; +#else + return v.s0; +#endif + +#else + // + // SIMT + // + return sub_group_broadcast(v,0); + +#endif +} + +// +// +// + +static +SKC_RASTERIZE_FLOAT +skc_subgroup_shuffle(SKC_RASTERIZE_FLOAT const v, + SKC_RASTERIZE_UINT const i) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return v; +#else + return shuffle(v,i); +#endif + +#else + // + // SIMT + // + return intel_sub_group_shuffle(v,i); + +#endif +} + +// +// +// + +static +SKC_RASTERIZE_FLOAT +skc_subgroup_shuffle_up_1(SKC_RASTERIZE_FLOAT const p, // 
previous + SKC_RASTERIZE_FLOAT const c) // current +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + // FIXME -- there are alternative formulations here: + // + // Option 1: + // + // select(c.rotate(+1),p.rotate(-1),(1,0,0,...)) + // + // Option 2: + // + // p is a scalar + // t = c.rotate(+1) + // t.s0 = p; + // + // Option 3: ... + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return p; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + return shuffle2(p,c,(uint2)(1,2)); +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + return shuffle2(p,c,(uint4)(3,4,5,6)); +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + return shuffle2(p,c,(uint8)(7,8,9,10,11,12,13,14)); +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + return shuffle2(p,c,(uint16)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30)); +#endif + +#else + // + // SIMT + // + return intel_sub_group_shuffle_up(p,c,1); + +#endif +} + +// +// +// + +static +bool +skc_is_lane_first() +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1) + // + // SIMD + // + return true; +#else + // + // SIMT + // + return get_sub_group_local_id() == 0; +#endif +} + +// +// +// + +static +SKC_RASTERIZE_FLOAT +skc_delta_offset() +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + return 1; +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 ) + return (SKC_RASTERIZE_FLOAT)( 1, 2 ); +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 ) + return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4 ); +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 ) + return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8 ); +#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 ) + return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ); +#endif + +#else + // + // SIMT + // + return 1.0f + get_sub_group_local_id(); + +#endif + +} + +// +// +// + +static +int +skc_subgroup_any(SKC_RASTERIZE_PREDICATE const p) +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + return any(p); +#else + // + // SIMT + // + return sub_group_any(p); +#endif +} + +// +// +// + +#define SKC_PATH_NODEWORD_IS_LAST(n) (((n) & SKC_DEVICE_BLOCK_WORDS_MASK) == SKC_DEVICE_BLOCK_WORDS_MASK) + +void +skc_segment_next(__global union skc_bp_elem * const bp_elems, + skc_uint * const nodeword, + skc_block_id_t * const id) +{ + if ((++*id & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) + { + if (SKC_PATH_NODEWORD_IS_LAST(++*nodeword)) + { + *nodeword = SKC_TAGGED_BLOCK_ID_GET_ID(bp_elems[*nodeword].tag_id) * SKC_DEVICE_SUBBLOCK_WORDS; + } + + skc_tagged_block_id_t const tag_id = bp_elems[*nodeword].tag_id; + + *id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); + } +} + +// +// +// + +static +SKC_RASTERIZE_FLOAT +skc_native_length(SKC_RASTERIZE_FLOAT const x, SKC_RASTERIZE_FLOAT const y) +{ + return native_sqrt(x * x + y * y); +} + +// +// Wang's Formula (1985) +// + +#define SKC_WANG_PIXEL_RESL 0.25f // <-- this can be tuned + +#define SKC_WANG_EPSILON (SKC_WANG_PIXEL_RESL * SKC_SUBPIXEL_RESL_X_F32) + +#define SKC_WANG_CUBIC ((3.0f * 2.0f) / (8.0f * SKC_WANG_EPSILON)) +#define SKC_WANG_QUADRATIC ((2.0f ) / (8.0f * SKC_WANG_EPSILON)) + +#define SKC_WANG_LENGTH(x,y) skc_native_length(x,y) +#define SKC_WANG_SQRT(x) native_sqrt(x) + +// +// +// + +static +SKC_RASTERIZE_FLOAT +skc_wangs_formula_cubic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y, + SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y, + SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y, + SKC_RASTERIZE_FLOAT const t3x, SKC_RASTERIZE_FLOAT const t3y) +{ + // + // Return 
the number of evenly spaced (in the parametric sense) line + // segments that are guaranteed to be within "epsilon" error of the + // curve. + // + // We're then going to take multiples of the reciprocal of this + // number so that the segmentation can be distributed across the + // subgroup. + // + // Note, this can probably be slightly optimized per architecture + // but it's probably far from being a hotspot since it's all + // straight-line unpredicated code. + // + // The result is an integer ranging from [1.0,#segments] + // + // Note that even if all of the control points are coincident, the + // max(1.0f) will categorize this as a line of 1 segment. + // + // This is what we want! We want to convert cubics to lines as + // easily as possible and *then* cull lines that are either + // horizontal or zero length. + // + return max(1.0f, + ceil(SKC_WANG_SQRT(SKC_WANG_CUBIC * + SKC_WANG_LENGTH(max(fabs(t2x - 2.0f * t1x + t0x), + fabs(t3x - 2.0f * t2x + t1x)), + max(fabs(t2y - 2.0f * t1y + t0y), + fabs(t3y - 2.0f * t2y + t1y)))))); +} + +static +SKC_RASTERIZE_FLOAT +skc_wangs_formula_quadratic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y, + SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y, + SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y) +{ + return max(1.0f, + ceil(SKC_WANG_SQRT(SKC_WANG_QUADRATIC * + SKC_WANG_LENGTH(fabs(t2x - 2.0f * t1x + t0x), + fabs(t2y - 2.0f * t1y + t0y))))); +} + +// +// rational curves +// + +static +SKC_RASTERIZE_FLOAT +skc_wangs_formula_cubic_rat() +{ + return 0.0f; +} + +static +SKC_RASTERIZE_FLOAT +skc_wangs_formula_quad_rat() +{ + return 0.0f; +} + +// +// flush any work-in-progress blocks and return unused block ids +// + +static +void +skc_finalize(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + skc_block_id_v_t * const blocks, + skc_uint const blocks_next, + skc_ttsk_v_t * const sk_v, + skc_uint const sk_v_next, + __global skc_ttsk_s_t * const sk_extent, + __local struct skc_subgroup_smem volatile * const smem) +{ + // + // flush non-empty bins + // + // FIXME -- accelerate this iteration/search with a subgroup operation + // + for (skc_uint ii=0; ii<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; ii++) + { + if (smem->bin.aN.count[ii] > 0) + { + skc_block_id_v_t const id = smem->bin.aN.id[ii]; + skc_uint const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); + skc_uint const tts = smem->bin.aN.ttsb[ii][skc_subgroup_lane()]; +#if 0 + printf("???????? 
: [ %10u = %10u : %08X ]\n",id,idx,tts); +#endif + bp_elems[idx].u32 = tts; + } + + // + // FIXME -- vectorize with vstoreN() + // + } + + // + // return remaining block ids back to the pool + // + skc_uint const blocks_rem = SKC_RASTERIZE_BLOCK_ID_V_SIZE - blocks_next; + + if (blocks_rem > 0) + { + skc_uint bp_idx = 0; + + if (skc_subgroup_lane() == 0) + { + bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,blocks_rem); + +#if 0 + printf("r-: %8u + %u\n",bp_idx,blocks_rem); +#endif + } + + bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane() - blocks_next) & bp_mask; + + if (skc_subgroup_lane() >= blocks_next) + { + bp_ids[bp_idx] = *blocks; + } + } + + // + // flush work-in-progress ryx keys + // + if (sk_v_next > 0) + { + skc_uint sk_idx = 0; + + if (skc_subgroup_lane() == 0) + { + sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE + (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,sk_v_next); +#if 0 + printf("* %u\n",sk_idx); +#endif + } + + sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane(); + + if (skc_subgroup_lane() < sk_v_next) + { + sk_extent[sk_idx] = *sk_v; + } + } +} + +// +// If there are lanes that were unable to append to a bin because +// their hashes collided with a bin's current ryx key then those bins +// must be ejected. +// +// Note that we do not eject "full" bins because lazily waiting for a +// collision results in simpler code. +// + +static +void +skc_flush(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + skc_block_id_t * const subblocks, + skc_block_id_v_t * const blocks, + skc_uint * const blocks_next, + skc_ttsk_v_t * const sk_v, + skc_uint * const sk_v_next, + __global skc_ttsk_s_t * const sk_extent, + __local struct skc_subgroup_smem volatile * const smem, + SKC_RASTERIZE_UINT const hash, + SKC_RASTERIZE_UINT const yx, + SKC_RASTERIZE_PREDICATE is_collision) // pass by value +{ +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + + // + // FIXME -- this code is now stale with the changes to the + // subblock/block allocation strategy + // + + // + // get local TTSB ID queue count + // + skc_uint ttsb_id_count = smem->pool.count; // scalar + + // init hash bit mask + skc_uint component_mask = 0; + + for (int cc=0; cc<SKC_RASTERIZE_VECTOR_SIZE; cc++) + { + // if no collision continue + if (((int*)&is_collision)[cc] == 0) + continue; + + uint const winner = ((uint*)&hash)[cc]; + uint const component_bit = 1u << winner; + + // if already processed this hash then continue + if (component_mask & component_bit) + continue; + + // update component mask + component_mask |= component_bit; + + // + // new winner requires ejecting the old TTSB + // + if (smem->bin.aN.count[winner] > 0) + { + skc_uint const elem_idx = smem->bin.aN.id[winner] * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); + + bp_elems[elem_idx].u32 = smem->bin.aN.ttsb[winner][skc_subgroup_lane()]; + } + + // + // ensure there is at least one TTSK and TTSB ID + // + if (ttsb_id_count == SKC_RASTERIZE_POOL_SIZE) + { + // + // update remaining count + // + ttsb_id_count = 0; + + // + // flush accumulated ttsk_ryx keys + // + uint const idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE + (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_POOL_SIZE); // ttsk_ryx_count + +#if 0 + printf("# %u\n",idx); +#endif + + for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; 
ii+=SKC_RASTERIZE_SUBGROUP_SIZE) + { + ttsk_ryx[idx + ii] = skc_make_ttsk_ryx(smem,SKC_CMD_RASTERIZE_GET_COHORT(cmd),ii); + } + + // + // allocate more ttsb ids from pool + // + uint const id = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+0,SKC_RASTERIZE_POOL_SIZE); // ring_reads + + for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE) + smem->pool.aN.id[ii] = bp_ids[id + ii]; + } + + // + // invalidate the winning block + // + + // + // update bin with winning yx, new ttsb id and zero count + // + // all lanes are loading/storing from/to the same index + // + smem->bin.vN.ttsb [winner] = ( SKC_TTS_INVALID ); + smem->bin.aN.id [winner] = smem->pool.aN.id[ttsb_id_count]; + smem->bin.aN.yx [winner] = smem->pool.aN.yx[ttsb_id_count] = ((uint*)&yx)[cc]; + smem->bin.aN.count[winner] = 0; + + // + // update count + // + ttsb_id_count += 1; + } + + // + // save count + // + smem->pool.count = ttsb_id_count; + +#else + // + // SIMT + // + + do { + // + // only one lane will win! + // + if (is_collision) + smem->subgroup.winner = hash; + + barrier(CLK_LOCAL_MEM_FENCE); + + // + // which bin is being ejected? + // + skc_uint const winner = smem->subgroup.winner; + + // + // which colliding hash is taking over the bin? + // + SKC_RASTERIZE_PREDICATE const is_winner = is_collision && (hash == winner); + + // + // all lanes with the same hash will try to store but only one + // lane will win + // + if (is_winner) + smem->subgroup.winner = yx; + + barrier(CLK_LOCAL_MEM_FENCE); + + // + // flush this block to the pool + // + if (smem->bin.aN.count[winner] > 0) + { + skc_block_id_v_t const id = smem->bin.aN.id[winner]; + skc_uint const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); + skc_uint const tts = smem->bin.aN.ttsb[winner][skc_subgroup_lane()]; +#if 0 + printf("%08X : [ %10u = %10u : %08X ]\n",yx,id,idx,tts); +#endif + bp_elems[idx].u32 = tts; + } + + // + // append new ttsk + // + skc_uint const new_yx = smem->subgroup.winner; + skc_block_id_t const new_id = skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_ARGS(), + blocks_next, + bp_atomics, + bp_mask, // pow2 modulo mask for block pool ring + bp_ids, + cohort_atomics, + sk_v, + sk_v_next, + sk_extent, + new_yx); + +#if 0 + if (get_sub_group_local_id() == 0) { + printf(">>> %9u\n",new_id); + } +#endif + + // + // update bin with winning yx, new ttsb id and zero count + // + smem->bin.aN.ttsb [winner][skc_subgroup_lane()] = SKC_TTS_INVALID; + smem->bin.aN.yx [winner] = new_yx; + smem->bin.aN.id [winner] = new_id; + smem->bin.aN.count[winner] = 0; + + // + // remove all lanes matching this hash + // + is_collision = is_collision && !is_winner; + + // + // exit if nothing left to do + // + } while (sub_group_any(is_collision)); + +#endif +} + +// +// scatter scan max +// +static +SKC_RASTERIZE_UINT +skc_scatter_scan_max(__local struct skc_subgroup_smem volatile * const smem, + SKC_RASTERIZE_FLOAT const iss, + SKC_RASTERIZE_FLOAT const ess) +{ + // + // prefix sums determine which lanes we're going to work on next + // + SKC_RASTERIZE_PREDICATE const is_scratch_store = (iss > 0.0f) && (ess < (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP); + SKC_RASTERIZE_UINT const scratch_idx = SKC_CONVERT(SKC_RASTERIZE_UINT)(max(ess,0.0f)); + +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // +#ifdef SKC_RASTERIZE_SIMD_USES_SMEM + // + // SIMD APPROACH 1: SIMT'ISH + // + + // zero the volatile smem scratchpad using vector syntax + smem->subgroup.vN.scratch[0] = ( 0 ); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if 
(is_scratch_store C) \ + smem->subgroup.aN.scratch[scratch_idx C] = I; + + SKC_RASTERIZE_VECTOR_EXPAND(); + + // propagate lanes to right using max scan + SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[0]; + SKC_RASTERIZE_UINT const source = skc_subgroup_scan_inclusive_max(scratch); + +#else + // + // SIMD APPROACH 2: SCALAR'ISH + // + + SKC_RASTERIZE_UINT source = ( 0 ); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (is_scratch_store C) \ + ((uint *)&source)[scratch_idx C] = I; + + SKC_RASTERIZE_VECTOR_EXPAND(); + + for (uint ii=1; ii<SKC_RASTERIZE_ELEMS_PER_SUBGROUP; ii++) + ((uint *)&source)[ii] = max(((uint *)&source)[ii-1],((uint *)&source)[ii]); +#endif + +#else + // + // SIMT + // + + // + // zero the volatile smem scratchpad using vector syntax + // + smem->subgroup.vN.scratch[skc_subgroup_lane()] = ( 0 ); + + // + // store source lane at starting lane + // + if (is_scratch_store) + smem->subgroup.aN.scratch[scratch_idx] = skc_subgroup_lane(); + + // + // propagate lanes to right using max scan + // + SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[skc_subgroup_lane()]; + SKC_RASTERIZE_UINT const source = skc_subgroup_scan_inclusive_max(scratch); +#endif + + return source; +} + +// +// sliver lines into subpixels +// + +static +void +skc_sliver(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + skc_block_id_t * const subblocks, + skc_block_id_v_t * const blocks, + skc_uint * const blocks_next, + skc_ttsk_v_t * const sk_v, + skc_uint * const sk_v_next, + __global skc_ttsk_s_t * const sk_extent, + __local struct skc_subgroup_smem volatile * const smem, + SKC_RASTERIZE_FLOAT const l0x, + SKC_RASTERIZE_FLOAT const l0y, + SKC_RASTERIZE_FLOAT const l1x, + SKC_RASTERIZE_FLOAT const l1y) +{ + // + // Y-SLIVERING + // ----------- + // + // immediately sliver all multi-pixel lines in into 1-pixel high + // lines + // + // note this implicitly squelches horizontal lines + // + // there is another test for horizontal lines after x-slivering + // is complete + // + + // + // will we need to flip the sign of y_delta ? + // + SKC_RASTERIZE_PREDICATE const y_lt = (l0y <= l1y); + SKC_RASTERIZE_UINT const dy_xor = y_lt ? 0 : 0x80000000; + + // + // save 1/dy + // + SKC_RASTERIZE_FLOAT const y_denom = native_recip(l1y - l0y); + + // + // how many non-horizontal subpixel y-axis slivers are there? + // + SKC_RASTERIZE_FLOAT const y_min = floor(fmin(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN); + SKC_RASTERIZE_FLOAT const y_max = ceil (fmax(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN); + SKC_RASTERIZE_FLOAT const y_base = y_lt ? 
y_min : y_max; + SKC_RASTERIZE_FLOAT y_segs = y_max - y_min; + + // + // inclusive subgroup scan of y_segs + // + SKC_RASTERIZE_FLOAT y_iss = skc_subgroup_scan_inclusive_add_float(y_segs); + SKC_RASTERIZE_FLOAT y_ess = y_iss - y_segs; + float y_rem = skc_subgroup_last_float(y_iss); + + // + // if this is a horizontal line then tweak y_iss so "is_scratch_store" always fails + // + if (y_segs == 0.0f) + y_iss = 0.0f; + +#if 0 + printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } (* %5.0f / %5.0f / %5.0f / %5.0f *) }, \n",a0x,a0y,a1x,a1y,y_segs,y_iss,y_ess,y_rem); +#endif + + // + // these values don't matter on first iteration + // + SKC_RASTERIZE_FLOAT n1x_prev = 0; + SKC_RASTERIZE_FLOAT n1y_prev = 0; + + // + // loop until done + // + while (y_rem > 0.0f) + { + // + // distribute work across lanes + // + SKC_RASTERIZE_UINT const y_source = skc_scatter_scan_max(smem,y_iss,y_ess); + + // + // get line at y_source line + // + SKC_RASTERIZE_FLOAT const m0x = skc_subgroup_shuffle(l0x,y_source); + SKC_RASTERIZE_FLOAT const m0y = skc_subgroup_shuffle(l0y,y_source); + SKC_RASTERIZE_FLOAT const m1x = skc_subgroup_shuffle(l1x,y_source); + SKC_RASTERIZE_FLOAT const m1y = skc_subgroup_shuffle(l1y,y_source); + + // + // every lane will create a 1 pixel tall line "sliver" + // + // FIXME -- this gets expanded on SIMD + // + // if numerator == 1 then this is the first lane + // if numerator == s then this is the last lane + // + SKC_RASTERIZE_FLOAT const y_delta = skc_delta_offset() - skc_subgroup_shuffle(y_ess,y_source); + SKC_RASTERIZE_FLOAT const y_count = skc_subgroup_shuffle(y_segs,y_source); + + SKC_RASTERIZE_PREDICATE const is_y_first = (y_delta == 1.0f); + SKC_RASTERIZE_PREDICATE const is_y_last = (y_delta >= y_count); + + // toggle y_delta sign + SKC_RASTERIZE_FLOAT const y_offset = as_float((as_uint(y_delta) ^ intel_sub_group_shuffle(dy_xor,y_source))); + + // + // calculate "right" line segment endpoint + // + SKC_RASTERIZE_FLOAT n1y = (y_offset + skc_subgroup_shuffle(y_base,y_source)) * SKC_SUBPIXEL_Y_SCALE_UP; + SKC_RASTERIZE_FLOAT const n_t = (n1y - m0y) * skc_subgroup_shuffle(y_denom,y_source); + SKC_RASTERIZE_FLOAT n1x = round(SKC_LERP(m0x,m1x,n_t)); + + // + // override c1 if this is last point + // + n1y = select(n1y,m1y,is_y_last); + n1x = select(n1x,m1x,is_y_last); + + // + // shuffle up "left" line segment endpoint + // + // NOTE: Intel's shuffle_up is unique with its elegant + // "previous" argument so don't get used to it + // + SKC_RASTERIZE_FLOAT n0y = skc_subgroup_shuffle_up_1(n1y_prev,n1y); + SKC_RASTERIZE_FLOAT n0x = skc_subgroup_shuffle_up_1(n1x_prev,n1x); + + // + // override shuffle up if this is the first line segment + // + n0y = select(n0y,m0y,is_y_first); + n0x = select(n0x,m0x,is_y_first); + + // + // save previous right endpoint + // + n1x_prev = n1x; + n1y_prev = n1y; + + // + // decrement by subgroup size + // + y_iss -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + y_ess -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + y_rem -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + +#if 0 + // + // debug + // + if (n0y != n1y) { + printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",n0x,n0y,n1x,n1y); + } +#endif + + // + // X-SLIVERING + // ----------- + // + // now sliver 1-pixel high lines into at either vertical or + // 1-pixel wide lines + // + // save original direction and work with increasing x + // + SKC_RASTERIZE_PREDICATE const x_lt = (n0x <= n1x); + SKC_RASTERIZE_UINT const dx_xor = x_lt ? 
0 : 0x80000000; + + // + // save 1/dy + // + SKC_RASTERIZE_FLOAT const x_denom = native_recip(n1x - n0x); + + // + // how many non-horizontal subpixel y-axis slivers are there? + // + SKC_RASTERIZE_FLOAT const x_min = floor(fmin(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN); + SKC_RASTERIZE_FLOAT const x_max = ceil (fmax(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN); + SKC_RASTERIZE_FLOAT const x_base = x_lt ? x_min : x_max; + SKC_RASTERIZE_FLOAT const x_segs = fmax(x_max - x_min,1.0f); + + // + // inclusive subgroup scan of y_segs + // + SKC_RASTERIZE_FLOAT x_iss = skc_subgroup_scan_inclusive_add_float(x_segs); + SKC_RASTERIZE_FLOAT x_ess = x_iss - x_segs; + float x_rem = skc_subgroup_last_float(x_iss); + + // + // if this is a horizontal line then tweak x_iss so "is_scratch_store" always fails + // + //if (x_segs == 0.0f) + // x_iss = 0.0f; + + // + // these values don't matter on first iteration + // + SKC_RASTERIZE_FLOAT p1x_prev = 0; + SKC_RASTERIZE_FLOAT p1y_prev = 0; + + // + // loop until done + // + while (x_rem > 0) + { + // + // distribute work across lanes + // + SKC_RASTERIZE_UINT const x_source = skc_scatter_scan_max(smem,x_iss,x_ess); + + // + // get line at y_source line + // + SKC_RASTERIZE_FLOAT const o0x = skc_subgroup_shuffle(n0x,x_source); + SKC_RASTERIZE_FLOAT const o0y = skc_subgroup_shuffle(n0y,x_source); + SKC_RASTERIZE_FLOAT const o1x = skc_subgroup_shuffle(n1x,x_source); + SKC_RASTERIZE_FLOAT const o1y = skc_subgroup_shuffle(n1y,x_source); + + // + // every lane will create a 1 pixel tall line "sliver" + // + // FIXME -- this gets expanded on SIMD + // + // if numerator == 1 then this is the first lane + // if numerator == s then this is the last lane + // + SKC_RASTERIZE_FLOAT const x_delta = skc_delta_offset() - skc_subgroup_shuffle(x_ess,x_source); + SKC_RASTERIZE_FLOAT const x_count = skc_subgroup_shuffle(x_segs,x_source); + + SKC_RASTERIZE_PREDICATE const is_x_first = (x_delta == 1.0f); + SKC_RASTERIZE_PREDICATE const is_x_last = (x_delta >= x_count); + + // toggle x_delta sign + SKC_RASTERIZE_FLOAT const x_offset = as_float((as_uint(x_delta) ^ intel_sub_group_shuffle(dx_xor,x_source))); + + // + // calculate "right" line segment endpoint + // + SKC_RASTERIZE_FLOAT p1x = (x_offset + skc_subgroup_shuffle(x_base,x_source)) * SKC_SUBPIXEL_X_SCALE_UP; + SKC_RASTERIZE_FLOAT const p_t = (p1x - o0x) * skc_subgroup_shuffle(x_denom,x_source); + SKC_RASTERIZE_FLOAT p1y = round(SKC_LERP(o0y,o1y,p_t)); + + // + // override c1 if this is last point + // + p1x = select(p1x,o1x,is_x_last); + p1y = select(p1y,o1y,is_x_last); + + // + // shuffle up "left" line segment endpoint + // + // NOTE: Intel's shuffle_up is unique with its elegant + // "previous" argument so don't get used to it + // + SKC_RASTERIZE_FLOAT p0x = skc_subgroup_shuffle_up_1(p1x_prev,p1x); + SKC_RASTERIZE_FLOAT p0y = skc_subgroup_shuffle_up_1(p1y_prev,p1y); + + // + // override shuffle up if this is the first line segment + // + p0x = select(p0x,o0x,is_x_first); + p0y = select(p0y,o0y,is_x_first); + + // + // save previous right endpoint + // + p1x_prev = p1x; + p1y_prev = p1y; + + // + // decrement by subgroup size + // + x_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + x_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + x_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + + // + // only non-horizontal subpixel lines are valid + // + SKC_RASTERIZE_PREDICATE is_active = (p0y != p1y); + + // + // if no lanes are active then continue + // + // FIXME -- THIS SIMPLE SUB_GROUP_ANY TEST SIGNIFICANTLY + // IMPACTS PERFORMANCE (+12% ?) 
+ // + // IT SHOULDN'T !!! + // +#if 0 + if (!skc_subgroup_any(is_active)) + continue; +#endif + + // + // Option 1: use SLM for explicitly managed coalesced stores + // + // 1. which tile does this line belong? + // 2. hash tile coordinates + // 3. lookup hash + // 4. if tile matches then SLM append keys + // 5. if tile doesn't match + // a. flush + // b. create new TTSK_RYX + // c. obtain TTSB block from pool + // d. goto 3. + // + + // + // Option 2: rely on L1/L2/L3 to mitigate non-coalesced stores + // + // 1. which tile does this line belong? + // 2. hash tile coordinates + // 3. lookup hash + // 4. if tile matches then GMEM append keys + // 5. if tile doesn't match + // a. flush (and invalidate empty elems) + // b. create new TTSK_RYX + // c. obtain TTSB block from pool + // d. goto 3. + // + + // + // The virtual rasterization surface is very large and + // signed: +/- ~64K-256K, depending on the architecture. + // + // Rasters must be clipped to the virtual surface and, + // optionally, clipped even further on a per raster + // basis. + // + + // + // Clip to the per-raster clip + // + + /* + + CLIP HERE + + */ + + // + // Hash the tile coordinates + // + // This table lists nominal values for each architecture. + // We want to choose values that are naturally fit the + // "width" of the architecture. + // + // SIMD RANGE BITS MAX RANGE MAX BINS HASH BITS + // ---- ------- ---- --------- -------- --------- + // 4 [0, 4] 3 [0, 7] 10 mod(10) <-- SSE42, ? + // 8 [0, 8] 4 [0, 15] 8 3 <-- GEN*,AVX* + // 16 [0, 16] 5 [0, 31] 6 mod(6) <-- GEN*,? + // 32 [0, 32] 6 [0, 63] 5 mod(5) <-- CUDA,PowerVR,Adreno,GEN* + // 64 [0, 64] 7 [0,127] 4 2 <-- AMD Radeon + // + // NOTE: When possible, bias the hash toward using more y + // bits because of: + // + // 1. the 90 degree counter-clockwise rotation that we put + // in place to offset the render-time clockwise + // rotation + // + // 2. the likely presence of left-to-right or + // right-to-left glyphs. + // + // For power-of-two bins, the hash is easy. + // + // For non-power-of-two, we may want to either implement a + // fast mod (compiler should do this for us... hahahaha) or + // drop down to the next power-of-two. 
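+ //
+ // Rough worked example for the power-of-two case (assuming, purely
+ // for illustration, that the 8-bin / 3-hash-bit row above is split
+ // as 2 y-bits and 1 x-bit to bias toward y -- not necessarily the
+ // configured SKC_RASTERIZE_TILE_HASH_* values):
+ //
+ //   tile_y = 5, tile_x = 3
+ //   hash   = ((5 & 0x3) << 1) | (3 & 0x1) = (1 << 1) | 1 = 3
+ //
+ // which has the same shape as the masked/shifted hash computed
+ // below.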
+ // + + // + // FIXME -- this snarl is not good -- can probably reduce + // some of the sign casting but some is there to vectorize a + // scalar + // + SKC_RASTERIZE_INT const z0y = SKC_CONVERT(SKC_RASTERIZE_INT)(p0y); + SKC_RASTERIZE_INT const z1y = SKC_CONVERT(SKC_RASTERIZE_INT)(p1y); + + SKC_RASTERIZE_INT const z0x = SKC_CONVERT(SKC_RASTERIZE_INT)(p0x); + SKC_RASTERIZE_INT const z1x = SKC_CONVERT(SKC_RASTERIZE_INT)(p1x); + + SKC_RASTERIZE_INT const min_y = min(z0y,z1y); + SKC_RASTERIZE_INT const max_y = max(z0y,z1y); + + SKC_RASTERIZE_INT const tile_y = min_y >> SKC_SUBTILE_RESL_Y_LOG2; + + SKC_RASTERIZE_UINT const ty = SKC_AS(SKC_RASTERIZE_UINT)(min_y) & SKC_SUBTILE_MASK_Y; + SKC_RASTERIZE_INT dy = SKC_AS(SKC_RASTERIZE_INT)(z1y - z0y); + + // + // map [+1,+32] to [ 0,+31] + // map [-1,-32] to [-1,-32] + // + SKC_RASTERIZE_INT dys = (dy + (~dy >> 31)) << 26; + + SKC_RASTERIZE_INT const min_x = min(z0x,z1x); + SKC_RASTERIZE_INT const max_x = max(z0x,z1x); + SKC_RASTERIZE_INT const tile_x = min_x >> SKC_SUBTILE_RESL_X_LOG2; + + SKC_RASTERIZE_UINT const tx = SKC_AS(SKC_RASTERIZE_UINT)(min_x) & SKC_SUBTILE_MASK_X; + SKC_RASTERIZE_UINT const sx = SKC_AS(SKC_RASTERIZE_UINT)(max_x - min_x); + + SKC_RASTERIZE_UINT const tts = dys | (ty << 16) | (sx << 10) | tx; + + SKC_RASTERIZE_UINT const hash = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & SKC_RASTERIZE_TILE_HASH_Y_MASK) << SKC_RASTERIZE_TILE_HASH_X_BITS) | + (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & SKC_RASTERIZE_TILE_HASH_X_MASK)); + + SKC_RASTERIZE_UINT const yx = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & 0xFFF) << 12) | (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & 0xFFF)); + +#if 0 + printf("(%3u, %3u)\n",tile_y,tile_x); +#endif + +#if 0 + if (is_active) + printf("( %3u, %3u ) : [ %3u, %3u, %3d, %3d, %3u ]\n",tile_y,tile_x,ty,tx,dy,((int)dys)>>26,sx); +#endif + + // + // debug + // +#if 0 // PRINTF_ENABLE + +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (is_active C) \ + printf("{ { %5d, %5d }, { %5d, %5d } (* %2u *) },\n",z0x C,z0y C,z1x C,z1y C,hash C); + + SKC_RASTERIZE_VECTOR_EXPAND(); +#else + if (is_active) + printf("{ { %5d, %5d }, { %5d, %5d } } (* %2u *),\n",z0x,z0y,z1x,z1y,hash); +#endif + +#endif + // + // flush all active lanes + // + while (true) + { + // + // either gather load or vector load+shuffle the yx keys + // +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + SKC_RASTERIZE_BIN const yx_bin = smem->bin.vN.yx; + SKC_RASTERIZE_UINT const yx_cur = shuffle(yx_bin,hash); +#else + SKC_RASTERIZE_UINT const yx_cur = smem->bin.aN.yx[hash]; +#endif + + // + // does yx for lane match yx for hash? + // + SKC_RASTERIZE_UINT const active_yx = is_active ? yx : SKC_RASTERIZE_YX_INVALID; + SKC_RASTERIZE_PREDICATE const is_match = (yx_cur == active_yx); + + // + // OpenCL spec: "When casting a bool to a vector integer + // data type, the vector components will be set to -1 + // (i.e. all bits set) if the vector bool value is true + // and 0 otherwise. + // +#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 ) + SKC_RASTERIZE_UINT const h_match = (SKC_RASTERIZE_UINT)is_match; +#else + SKC_RASTERIZE_UINT const h_match = abs(is_match); // {-1,0} -> {+1,0} +#endif + // + // how many new elements for each matching hash bin? 
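+ //
+ // Each bin owns SKC_RASTERIZE_TILE_HASH_BIN_BITS bits of a uint so a
+ // single inclusive add scan can count appends to every bin at once.
+ //
+ // Rough worked example (assuming, for illustration only, 4 bits per
+ // bin): a lane matching bin 2 contributes h = 1 << (2*4) = 0x100, so
+ // after the scan bits [8..11] of h_iss hold that lane's 1-based rank
+ // among the lanes targeting bin 2 and the same field of h_total
+ // holds their total -- these feed ttsb_index and count_new below.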
+ // + SKC_RASTERIZE_UINT const h_shl = hash * SKC_RASTERIZE_TILE_HASH_BIN_BITS; + SKC_RASTERIZE_UINT const h = h_match << h_shl; + + // + // prefix sum all of the bins in parallel + // + SKC_RASTERIZE_UINT const h_iss = skc_subgroup_scan_inclusive_add_uint(h); + SKC_RASTERIZE_UINT const h_total = skc_subgroup_last_uint(h_iss); + + // + // current bin counts + // +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + SKC_RASTERIZE_BIN const count_bin = smem->bin.vN.count; + SKC_RASTERIZE_UINT const count_cur = shuffle(count_bin,hash); +#else + SKC_RASTERIZE_UINT const count_cur = smem->bin.aN.count[hash]; +#endif + + // + // calculate where each cache-hit and in-bounds tts should be stored + // + SKC_RASTERIZE_UINT const ttsb_index = (h_iss >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur - 1; + SKC_RASTERIZE_UINT const count_new = (h_total >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur; + + // + // which lanes can append to a matching bin? + // + SKC_RASTERIZE_PREDICATE const is_append = is_match && (ttsb_index < SKC_DEVICE_SUBBLOCK_WORDS); + + // + // scatter append tts elements to bin blocks + // +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1) + // + // SIMD + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (is_append C) \ + { \ + smem->bin.aN.ttsb [hash C][ttsb_index C] = tts C; \ + smem->bin.aN.count[hash C] = count_new C; \ + } + + SKC_RASTERIZE_VECTOR_EXPAND(); +#else + // + // SIMT + // + if (is_append) + { + smem->bin.aN.ttsb [hash][ttsb_index] = tts; + smem->bin.aN.count[hash] = count_new; // it's ok if this is > SKC_DEVICE_SUBBLOCK_WORDS + } +#endif + // + // try to keep predicate updates SIMD-friendly and + // outside of predicated code paths -- this is not + // always how we would normally do things on SIMT but + // either approach is acceptable + // + + // + // mask off lanes/components that successfully appended + // + is_active = is_active && !is_append; + + // + // are there any active lanes left? + // + if (!skc_subgroup_any(is_active)) + break; + + // + // There are active lanes that couldn't be appended to a + // bin because their hashes collided with the bin's + // current ryx key then those bins must be ejected. + // + // Note that we do not eject "full" bins because lazily + // waiting for a collision results in simpler code. + // + skc_flush(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + subblocks, + blocks, + blocks_next, + sk_v, + sk_v_next, + sk_extent, + smem, + hash, + yx, + is_active); + } + } + } +} + +// +// INITIALIZE SMEM +// +// Note that SIMD/SIMT have nearly the same syntax. 
+// +static +void +skc_smem_init(__local struct skc_subgroup_smem volatile * const smem) +{ + // + // initialize smem bins + // +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + smem->bin.vN.yx = ( SKC_RASTERIZE_YX_INIT ); + smem->bin.vN.count = ( 0 ); +#else + // + // SIMT + // + int idx = skc_subgroup_lane(); + +#if ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT < SKC_RASTERIZE_ELEMS_PER_SUBGROUP ) + if (idx < SKC_RASTERIZE_TILE_HASH_BIN_COUNT) +#elif ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT > SKC_RASTERIZE_ELEMS_PER_SUBGROUP ) + for (; idx<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; idx+=SKC_RASTERIZE_SUBGROUP_SIZE) +#endif + { + smem->bin.aN.yx [idx] = ( SKC_RASTERIZE_YX_INIT ); + smem->bin.aN.count[idx] = ( 0 ); + } +#endif +} + +// +// RASTERIZE CUBIC KERNEL +// + +static +void +skc_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __local struct skc_subgroup_smem volatile * const smem, + + skc_uint * const nodeword, + skc_block_id_t * const id, + + union skc_transform const * const tv, + union skc_path_clip const * const cv, + skc_uint const cohort) +{ + // + // the initial segment idx and segments-per-block constant determine + // how many block ids will need to be loaded + // + SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c3x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c3y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + // + // apply transform + // + // note that we only care if the end points are rounded to subpixel precision + // + // FIXME -- transformation is currently affine-only support perspective later + // + // the affine transformation requires 8 FMA + 2 ROUND operations + // + SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx); + SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty); + + SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx + c1y * tv->shx + tv->tx; + SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy + tv->ty; + + SKC_RASTERIZE_FLOAT const t2x = c2x * tv->sx + c2y * tv->shx + tv->tx; + SKC_RASTERIZE_FLOAT const t2y = c2x * tv->shy + c2y * tv->sy + tv->ty; + + SKC_RASTERIZE_FLOAT const t3x = round(c3x * tv->sx + c3y * tv->shx + tv->tx); + SKC_RASTERIZE_FLOAT const t3y = round(c3x * tv->shy + c3y * tv->sy + tv->ty); + + // + // + // +#if PRINTF_ENABLE + +#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 ) + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + printf("{ { %.02f, %.02f }, { %.02f, %.02f }," \ + " { %.02f, %.02f }, { %.02f, %.02f } },\n", \ + b0x C,b0y C,t1x C,t1y C, \ + t2x C,t2y C,t3x 
C,t3y C); + + SKC_RASTERIZE_VECTOR_EXPAND(); + +#else + + printf("{ { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f } },\n", + b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y); + +#endif + +#endif + + // + // OLD APPROACH + // ------------ + // + // The Spinel CUDA rasterizer was significantly more complex and + // performed a few different tasks that are probably best kept + // separate. + // + // The Spinel rasterizer Bezier held 4-element x and y coordinates + // in adjacent lanes. This simplified intermingling of single lane + // 4-coordinate line segments with two-lane cubic Beziers. + // + // After transformation of the input segments, the Spinel rasterizer + // would test cubics for flatness and, if flat, collapse the + // adjacent lanes into a single line lane and an empty lane. + // + // Any lines would then be appended to a line queue. + // + // Any cubics would then be subdivided. + // + // The reclassification process would be repeated. + // + // NEW APPROACH + // ------------ + // + // Assume we're only working with cubics in this kernel. + // + // Optimization: if the line segment is a special case -- a cusp, + // has 1+ inflections, or a loop -- it might be beneficial to + // subdivide the control cage 1+ times in order to separate the + // flatter segments the high-velocity region(s). + // + // This means we want to split using [a,b] formulation to _directly_ + // subdivide producing a new control cage. + // + // Wang's Formula is still useful even if we subdivide once or twice + // as it's so cheap that it might give some useful hints about where + // the high-velocity sections of curve reside. + // + // But it seems like using Wang's and directly flattening to line + // segments without any subdivision is good enough for the limited + // set of test cases that I've tried. + // + // So... use Wang's Formula to estimate how many line segment are + // required to properly flatten the cubics. + // + // Then use inclusive/exclusive scans to put all the lanes to work: + // + // 1. segmenting cubics to line segments + // + // 2. slivering line segments into 1-pixel high line segments + // + // 3. slivering 1-pixel high line segments into 1-pixel wide line + // segments + // + // MORE BACKGROUND ON NEW APPROACH + // ------------------------------- + // + // Two options for handling line segments: + // + // 1. append the line segments onto an SLM array until enough + // work has been accrued (Spinel does this) + // + // 2. immediately sliver the potentially multi-pixel line + // segments into subpixel lines + // + // The advantage of (1) is that it guarantees the slivering + // process will, on average, always be emitting a full subgroup + // of subpixel lines. + // + // The advantage of (2) is that it reduces code complexity and + // leaves more room for SLM tile bins. The difference between Spinel + // and Skia Compute is that Wang's Formula guarantees there will be + // a full subgroup of multi-pixel lines unless this is the final + // iteration of the warp of multi-pixel lines. + // + // Note that wider GPU architectures might benefit from (1) and + // other work accumulation strategies because it will minimize + // partial warp workloads in the final iteration of each stage. It + // also minimizes the sunk cost of the uniform control logic steps. + // + // So let's implement (2) for now... + // + + // + // And... begin! + // + // Estimate how many line segments are in quad/cubic curve. 
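+ //
+ // In scalar form, Wang's bound for a degree-n Bezier is roughly:
+ //
+ //   N = ceil( sqrt( (n*(n-1)/8) * max_i |P[i+2] - 2*P[i+1] + P[i]| / tolerance ) )
+ //
+ // with n=3 for cubics and n=2 for quads -- the SKC_WANG_CUBIC and
+ // SKC_WANG_QUADRATIC constants presumably fold the n*(n-1)/8 factor
+ // and the flattening tolerance into a single multiplier.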
+ // + // Wang's Formula will return zero if the control points are + // collinear but we bump it up to 1.0f. + // + SKC_RASTERIZE_FLOAT const s_segs = skc_wangs_formula_cubic(b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y); + + // + // if there are free registers then precalculate the reciprocal for + // each estimated segments since it will never change + // + SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs); + + + // + // inclusive add scan of estimated line segments + // exclusive add scan of estimated line segments + // total number of estimated line segments + // + SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs); + SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs; + float s_rem = skc_subgroup_last_float(s_iss); // scalar + + // + // Precompute cubic polynomial coefficients from transformed control + // cage so we can shuffle them in on each iteration of the outer + // loop and then evaluate the polynomial in Horner form. + // + // | 1 0 0 0 | | c0 | + // | | | | + // | -3 3 0 0 | | c1 | + // B(t) = [ 1 t^1 t^2 t^3 ] | | | | + // | 3 -6 3 0 | | c2 | + // | | | | + // | -1 3 -3 1 | | c3 | + // + // + SKC_RASTERIZE_FLOAT const b1x = mad(-3.0f,b0x,3.0f*t1x); // 2 - 1 MAD + MUL + SKC_RASTERIZE_FLOAT const b1y = mad(-3.0f,b0y,3.0f*t1y); // 2 - 1 MAD + MUL + + SKC_RASTERIZE_FLOAT const b2x = mad(3.0f,b0x,mad(-6.0f,t1x,3.0f*t2x)); // 3 - 2 MAD + MUL + SKC_RASTERIZE_FLOAT const b2y = mad(3.0f,b0y,mad(-6.0f,t1y,3.0f*t2y)); // 3 - 2 MAD + MUL + + SKC_RASTERIZE_FLOAT const b3x = mad(3.0f,t1x,mad(-3.0f,t2x,t3x)) - b0x; // 3 - 2 MAD + SUB + SKC_RASTERIZE_FLOAT const b3y = mad(3.0f,t1y,mad(-3.0f,t2y,t3y)) - b0y; // 3 - 2 MAD + SUB + + // + // these values don't matter on the first iteration + // + SKC_RASTERIZE_FLOAT l1x_prev = 0; + SKC_RASTERIZE_FLOAT l1y_prev = 0; + + // + // allocate and init in-register TTSK keys + // + skc_uint sk_v_next = 0; + skc_ttsk_v_t sk_v; + + sk_v.hi = cohort; + + // + // initialize smem + // + skc_smem_init(smem); + + // + // initialize blocks / subblocks + // + skc_block_id_v_t blocks; + skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE; + +#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + skc_block_id_t subblocks = 0; +#endif + + // + // loop until done + // + while (s_rem > 0) + { + // + // distribute work across lanes + // + SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess); + + // + // every lane has a fraction to work off of + // + // FIXME -- this gets expanded on SIMD + // + // if delta == 1 then this is the first lane + // if count == s_segs then this is the last lane + // + SKC_RASTERIZE_FLOAT const s_delta = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source); + SKC_RASTERIZE_FLOAT const s_count = skc_subgroup_shuffle(s_segs,s_source); + + SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f); + SKC_RASTERIZE_PREDICATE const is_s_last = (s_delta >= s_count); + + // + // init parametric t + // + SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)? + + // + // if last then override to a hard 1.0f + // + s_t = is_s_last ? 
1.0f : s_t; + + // + // decrement by subgroup size + // + s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + + // + // now every lane knows what to do and the following lines will + // pump out up to SUBGROUP_SIZE line segments + // + // obtain the src vertices through shared or via a shuffle + // + + // + // shuffle in the polynomial coefficients their source lane + // + SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source); + SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source); + + SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source); + SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source); + + SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source); + SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source); + + SKC_RASTERIZE_FLOAT const s3x = skc_subgroup_shuffle(b3x,s_source); + SKC_RASTERIZE_FLOAT const s3y = skc_subgroup_shuffle(b3y,s_source); + + // + // calculate "right" line segment endpoint using Horner form + // + SKC_RASTERIZE_FLOAT l1x = round(mad(mad(mad(s3x,s_t,s2x),s_t,s1x),s_t,s0x)); // 3 MAD + ROUND + SKC_RASTERIZE_FLOAT l1y = round(mad(mad(mad(s3y,s_t,s2y),s_t,s1y),s_t,s0y)); // 3 MAD + ROUND + + // + // shuffle up "left" line segment endpoint + // + // NOTE: Intel's shuffle_up is unique with its elegant + // "previous" argument so don't get used to it + // + SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x); + SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y); + + // + // save previous right endpoint + // + l1x_prev = l1x; + l1y_prev = l1y; + + // + // override shuffle up if this is the first line segment + // + l0x = select(l0x,s0x,is_s_first); + l0y = select(l0y,s0y,is_s_first); + + // + // sliver lines + // + skc_sliver(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + &subblocks, + &blocks, + &blocks_next, + &sk_v, + &sk_v_next, + sk_extent, + smem, + l0x,l0y,l1x,l1y); + } + + // + // - flush work-in-progress blocks + // - return unused block ids + // + skc_finalize(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + &blocks, + blocks_next, + &sk_v, + sk_v_next, + sk_extent, + smem); +} + +// +// RASTERIZE QUAD KERNEL +// + +static +void +skc_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __local struct skc_subgroup_smem volatile * const smem, + + skc_uint * const nodeword, + skc_block_id_t * const id, + + union skc_transform const * const tv, + union skc_path_clip const * const cv, + skc_uint const cohort) +{ + // + // the initial segment idx and segments-per-block constant determine + // how many block ids will need to be loaded + // + SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + 
skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + // + // apply transform + // + // note that we only care if the end points are rounded to subpixel precision + // + // FIXME -- transformation is currently affine-only support perspective later + // + // the affine transformation requires 8 FMA + 2 ROUND operations + // + SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx); + SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty); + + SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx + c1y * tv->shx + tv->tx; + SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy + tv->ty; + + SKC_RASTERIZE_FLOAT const t2x = round(c2x * tv->sx + c2y * tv->shx + tv->tx); + SKC_RASTERIZE_FLOAT const t2y = round(c2x * tv->shy + c2y * tv->sy + tv->ty); + + // + // Estimate how many line segments are in quad/cubic curve. + // + // Wang's Formula will return zero if the control points are + // collinear but we bump it up to 1.0f. + // + SKC_RASTERIZE_FLOAT const s_segs = skc_wangs_formula_quadratic(b0x,b0y,t1x,t1y,t2x,t2y); + + // + // if there are free registers then precalculate the reciprocal for + // each estimated segments since it will never change + // + SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs); + + + // + // inclusive add scan of estimated line segments + // exclusive add scan of estimated line segments + // total number of estimated line segments + // + SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs); + SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs; + float s_rem = skc_subgroup_last_float(s_iss); // scalar + + // + // Precompute quadratic polynomial coefficients from control cage so + // we can shuffle them in on each iteration of the outer loop and + // then evaluate the polynomial in Horner form. 
+ // + + // | 1 0 0 | | c0 | + // | | | | + // B(t) = [ 1 t^1 t^2 ] | -2 2 0 | | c1 | + // | | | | + // | 1 -2 1 | | c2 | + // + // + SKC_RASTERIZE_FLOAT const b1x = mad(-2.0f,b0x,2.0f*t1x); // 2 - 1 MAD + MUL + SKC_RASTERIZE_FLOAT const b1y = mad(-2.0f,b0y,2.0f*t1y); // 2 - 1 MAD + MUL + + SKC_RASTERIZE_FLOAT const b2x = mad(-2.0f,t1x,b0x+t2x); // 2 - 1 MAD + ADD + SKC_RASTERIZE_FLOAT const b2y = mad(-2.0f,t1y,b0y+t2y); // 2 - 1 MAD + ADD + + // + // these values don't matter on the first iteration + // + SKC_RASTERIZE_FLOAT l1x_prev = 0; + SKC_RASTERIZE_FLOAT l1y_prev = 0; + + // + // allocate and init in-register TTSK keys + // + skc_uint sk_v_next = 0; + skc_ttsk_v_t sk_v; + + sk_v.hi = cohort; + + // + // initialize smem + // + skc_smem_init(smem); + + // + // initialize blocks / subblocks + // + skc_block_id_v_t blocks; + skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE; + +#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + skc_block_id_t subblocks = 0; +#endif + + // + // loop until done + // + while (s_rem > 0) + { + // + // distribute work across lanes + // + SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess); + + // + // every lane has a fraction to work off of + // + // FIXME -- this gets expanded on SIMD + // + // if delta == 1 then this is the first lane + // if count == s_segs then this is the last lane + // + SKC_RASTERIZE_FLOAT const s_delta = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source); + SKC_RASTERIZE_FLOAT const s_count = skc_subgroup_shuffle(s_segs,s_source); + + SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f); + SKC_RASTERIZE_PREDICATE const is_s_last = (s_delta >= s_count); + + // + // init parametric t + // + SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)? + + // + // if last then override to a hard 1.0f + // + s_t = is_s_last ? 
1.0f : s_t; + + // + // decrement by subgroup size + // + s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP; + + // + // now every lane knows what to do and the following lines will + // pump out up to SUBGROUP_SIZE line segments + // + // obtain the src vertices through shared or via a shuffle + // + + // + // shuffle in the polynomial coefficients their source lane + // + SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source); + SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source); + + SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source); + SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source); + + SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source); + SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source); + + // + // calculate "right" line segment endpoint using Horner form + // + SKC_RASTERIZE_FLOAT l1x = round(mad(mad(s2x,s_t,s1x),s_t,s0x)); // 2 MAD + ROUND + SKC_RASTERIZE_FLOAT l1y = round(mad(mad(s2y,s_t,s1y),s_t,s0y)); // 2 MAD + ROUND + + // + // shuffle up "left" line segment endpoint + // + // NOTE: Intel's shuffle_up is unique with its elegant + // "previous" argument so don't get used to it + // + SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x); + SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y); + + // + // save previous right endpoint + // + l1x_prev = l1x; + l1y_prev = l1y; + + // + // override shuffle up if this is the first line segment + // + l0x = select(l0x,s0x,is_s_first); + l0y = select(l0y,s0y,is_s_first); + + // + // sliver lines + // + skc_sliver(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + &subblocks, + &blocks, + &blocks_next, + &sk_v, + &sk_v_next, + sk_extent, + smem, + l0x,l0y,l1x,l1y); + } + + // + // - flush work-in-progress blocks + // - return unused block ids + // + skc_finalize(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + &blocks, + blocks_next, + &sk_v, + sk_v_next, + sk_extent, + smem); +} + +// +// RASTERIZE LINE KERNEL +// + +static +void +skc_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __local struct skc_subgroup_smem volatile * const smem, + + skc_uint * const nodeword, + skc_block_id_t * const id, + + union skc_transform const * const tv, + union skc_path_clip const * const cv, + skc_uint const cohort) +{ + // + // the initial segment idx and segments-per-block constant determine + // how many block ids will need to be loaded + // + SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + + skc_segment_next(bp_elems,nodeword,id); + + SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord; + +#if 0 + // printf("%5u : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",(skc_uint)get_global_id(0),c0x,c0y,c1x,c1y); + printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",c0x,c0y,c1x,c1y); +#endif + + // + // apply transform + // + // note that we only care if the end points are rounded to subpixel precision + 
// + // FIXME -- transformation is currently affine-only + // FIXME -- support perspective later + // + // the affine transformation requires 8 FMA + 4 ROUND operations + // + SKC_RASTERIZE_FLOAT const l0x = round(c0x * tv->sx + c0y * tv->shx + tv->tx); + SKC_RASTERIZE_FLOAT const l0y = round(c0x * tv->shy + c0y * tv->sy + tv->ty); + + SKC_RASTERIZE_FLOAT const l1x = round(c1x * tv->sx + c1y * tv->shx + tv->tx); + SKC_RASTERIZE_FLOAT const l1y = round(c1x * tv->shy + c1y * tv->sy + tv->ty); + +#if 0 + printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",l0x,l0y,l1x,l1y); +#endif + + // + // allocate and init in-register TTSK keys + // + skc_uint sk_v_next = 0; + skc_ttsk_v_t sk_v; + + sk_v.hi = cohort; + + // + // initialize smem + // + skc_smem_init(smem); + + // + // initialize blocks / subblocks + // + skc_block_id_v_t blocks; + skc_uint blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE; + +#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + skc_block_id_t subblocks = 0; +#endif + + // + // sliver lines + // + skc_sliver(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + &subblocks, + &blocks, + &blocks_next, + &sk_v, + &sk_v_next, + sk_extent, + smem, + l0x,l0y,l1x,l1y); + + // + // - flush work-in-progress blocks + // - return unused block ids + // + skc_finalize(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + &blocks, + blocks_next, + &sk_v, + sk_v_next, + sk_extent, + smem); +} + +// +// +// + +__kernel +SKC_RASTERIZE_KERNEL_ATTRIBS +void +skc_kernel_rasterize_all(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __global float8 const * const transforms, // FIXME -- __constant + __global float4 const * const clips, // FIXME -- __constant + __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant + skc_uint const count) +{ + // + // declare shared memory block + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + __local struct skc_subgroup_smem volatile smem[1]; +#else + __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; + __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); +#endif + + // + // this is a subgroup/warp-centric kernel + // + // which subgroup in the grid is this? + // + // TAKE NOTE: the Intel GEN compiler appears to be recognizing + // get_group_id(0) as a uniform but the alternative calculation used + // when there are multiple subgroups per workgroup is not + // cooperating and driving spillage elsewhere. 
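+ //
+ // Either way the mapping is the same: with
+ // SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 2 (a value chosen purely for
+ // illustration), workgroup 3 / subgroup 1 would process
+ // cmd_idx = 3 * 2 + 1 = 7.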
+ // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + uint const cmd_idx = get_group_id(0); +#else + uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + +#if 0 + if (get_sub_group_local_id() == 0) + printf("+cmd_idx = %u\n",cmd_idx); +#endif + + // + // if worksgroups are multi-subgroup then there may be excess + // subgroups in the final workgroup + // + if (cmd_idx >= count) + return; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("-cmd_idx = %u\n",cmd_idx); +#endif + + // + // load a single command for this subgroup + // + union skc_cmd_rasterize const cmd = cmds[cmd_idx]; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("[ %u ]< %u, %u, %u, %u >\n", + cmd_idx, + cmd.nodeword, + SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd), + SKC_CMD_RASTERIZE_GET_CLIP(cmd), + SKC_CMD_RASTERIZE_GET_COHORT(cmd)); +#endif + + // + // get first block node command word and its subblock + // + skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing + skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; + skc_block_id_tag tag = SKC_TAGGED_BLOCK_ID_GET_TAG(tag_id); + skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); + + // + // load transform -- uniform across subgroup + // + // v8: { sx shx tx shy sy ty w0 w1 } + // + // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: + // + // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] + // + // Coordinates are scaled to subpixel resolution. All that matters + // is that continuity is maintained between end path element + // endpoints. + // + // It's the responsibility of the host to ensure that the transforms + // are properly scaled either via intitializing a transform stack + // with the subpixel resolution scaled identity or scaling the + // transform before its loaded by a rasterization grid. 
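+ //
+ // For example, a host-side prescale might look like this (an
+ // illustrative sketch only, not the actual host API):
+ //
+ //   sx  *= SKC_SUBPIXEL_RESL_X_F32;  shx *= SKC_SUBPIXEL_RESL_X_F32;  tx *= SKC_SUBPIXEL_RESL_X_F32;
+ //   shy *= SKC_SUBPIXEL_RESL_Y_F32;  sy  *= SKC_SUBPIXEL_RESL_Y_F32;  ty *= SKC_SUBPIXEL_RESL_Y_F32;
+ //
+ // i.e. the x row of the 2x3 affine is scaled by the subpixel x
+ // resolution and the y row by the subpixel y resolution.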
+ // + // FIXME -- horizontal load might be better than this broadcast load + // + union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load + union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load + skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted + + switch (tag) + { + case SKC_BLOCK_ID_TAG_PATH_LINE: + skc_rasterize_lines(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + sk_extent, + smem, + &nodeword,&id, + &tv,&cv,cohort); + break; + + case SKC_BLOCK_ID_TAG_PATH_QUAD: + skc_rasterize_quads(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + sk_extent, + smem, + &nodeword,&id, + &tv,&cv,cohort); + break; + + case SKC_BLOCK_ID_TAG_PATH_CUBIC: + skc_rasterize_cubics(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + sk_extent, + smem, + &nodeword,&id, + &tv,&cv,cohort); + break; + + case SKC_BLOCK_ID_TAG_PATH_RAT_QUAD: + break; + case SKC_BLOCK_ID_TAG_PATH_RAT_CUBIC: + break; + + default: + break; + } +} + +// +// +// + +__kernel +SKC_RASTERIZE_KERNEL_ATTRIBS +void +skc_kernel_rasterize_lines(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __global float8 const * const transforms, // FIXME -- __constant + __global float4 const * const clips, // FIXME -- __constant + __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant + skc_uint const count) +{ + // + // declare shared memory block + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + __local struct skc_subgroup_smem volatile smem[1]; +#else + __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; + __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); +#endif + + // + // this is a subgroup/warp-centric kernel + // + // which subgroup in the grid is this? + // + // TAKE NOTE: the Intel GEN compiler appears to be recognizing + // get_group_id(0) as a uniform but the alternative calculation used + // when there are multiple subgroups per workgroup is not + // cooperating and driving spillage elsewhere. + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + uint const cmd_idx = get_group_id(0); +#else + uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + + // + // if worksgroups are multi-subgroup then there may be excess + // subgroups in the final workgroup + // + if (cmd_idx >= count) + return; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("cmd_idx = %u\n",cmd_idx); +#endif + + // + // load a single command for this subgroup + // + union skc_cmd_rasterize const cmd = cmds[cmd_idx]; + + // + // get first block node command word and its subblock + // + skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing + skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; + skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); + + // + // load transform -- uniform across subgroup + // + // v8: { sx shx tx shy sy ty w0 w1 } + // + // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: + // + // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] + // + // Coordinates are scaled to subpixel resolution. All that matters + // is that continuity is maintained between end path element + // endpoints. 
+ // + // It's the responsibility of the host to ensure that the transforms + // are properly scaled either via intitializing a transform stack + // with the subpixel resolution scaled identity or scaling the + // transform before its loaded by a rasterization grid. + // + // FIXME -- horizontal load might be better than this broadcast load + // + union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load + union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load + skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted + + skc_rasterize_lines(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + sk_extent, + smem, + &nodeword,&id, + &tv,&cv,cohort); +} + +// +// +// + +// +// +// + +__kernel +SKC_RASTERIZE_KERNEL_ATTRIBS +void +skc_kernel_rasterize_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __global float8 const * const transforms, // FIXME -- __constant + __global float4 const * const clips, // FIXME -- __constant + __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant + skc_uint const count) +{ + // + // declare shared memory block + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + __local struct skc_subgroup_smem volatile smem[1]; +#else + __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; + __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); +#endif + + // + // this is a subgroup/warp-centric kernel + // + // which subgroup in the grid is this? + // + // TAKE NOTE: the Intel GEN compiler appears to be recognizing + // get_group_id(0) as a uniform but the alternative calculation used + // when there are multiple subgroups per workgroup is not + // cooperating and driving spillage elsewhere. + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + uint const cmd_idx = get_group_id(0); +#else + uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + + // + // if worksgroups are multi-subgroup then there may be excess + // subgroups in the final workgroup + // + if (cmd_idx >= count) + return; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("cmd_idx = %u\n",cmd_idx); +#endif + + // + // load a single command for this subgroup + // + union skc_cmd_rasterize const cmd = cmds[cmd_idx]; + + // + // get first block node command word and its subblock + // + skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing + skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; + skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); + + // + // load transform -- uniform across subgroup + // + // v8: { sx shx tx shy sy ty w0 w1 } + // + // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: + // + // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] + // + // Coordinates are scaled to subpixel resolution. All that matters + // is that continuity is maintained between end path element + // endpoints. + // + // It's the responsibility of the host to ensure that the transforms + // are properly scaled either via intitializing a transform stack + // with the subpixel resolution scaled identity or scaling the + // transform before its loaded by a rasterization grid. 
+ // + // FIXME -- horizontal load might be better than this broadcast load + // + union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load + union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load + skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted + + skc_rasterize_quads(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + sk_extent, + smem, + &nodeword,&id, + &tv,&cv,cohort); +} + +// +// +// + +__kernel +SKC_RASTERIZE_KERNEL_ATTRIBS +void +skc_kernel_rasterize_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __global float8 const * const transforms, // FIXME -- __constant + __global float4 const * const clips, // FIXME -- __constant + __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant + skc_uint const count) +{ + // + // declare shared memory block + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + __local struct skc_subgroup_smem volatile smem[1]; +#else + __local struct skc_subgroup_smem volatile smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS]; + __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id(); +#endif + + // + // this is a subgroup/warp-centric kernel + // + // which subgroup in the grid is this? + // + // TAKE NOTE: the Intel GEN compiler appears to be recognizing + // get_group_id(0) as a uniform but the alternative calculation used + // when there are multiple subgroups per workgroup is not + // cooperating and driving spillage elsewhere. + // +#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 ) + uint const cmd_idx = get_group_id(0); +#else + uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + + // + // if worksgroups are multi-subgroup then there may be excess + // subgroups in the final workgroup + // + if (cmd_idx >= count) + return; + +#if 0 + if (get_sub_group_local_id() == 0) + printf("cmd_idx = %u\n",cmd_idx); +#endif + + // + // load a single command for this subgroup + // + union skc_cmd_rasterize const cmd = cmds[cmd_idx]; + + // + // get first block node command word and its subblock + // + skc_uint nodeword = cmd.nodeword; // nodeword has word-addressing + skc_tagged_block_id_t tag_id = bp_elems[nodeword].tag_id; + skc_block_id_t id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id); + + // + // load transform -- uniform across subgroup + // + // v8: { sx shx tx shy sy ty w0 w1 } + // + // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY: + // + // [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ] + // + // Coordinates are scaled to subpixel resolution. All that matters + // is that continuity is maintained between end path element + // endpoints. + // + // It's the responsibility of the host to ensure that the transforms + // are properly scaled either via intitializing a transform stack + // with the subpixel resolution scaled identity or scaling the + // transform before its loaded by a rasterization grid. 
+ // + // FIXME -- horizontal load might be better than this broadcast load + // + union skc_transform const tv = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load + union skc_path_clip const cv = { .f32v4 = clips [SKC_CMD_RASTERIZE_GET_CLIP(cmd) ] }; // uniform load + skc_uint const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted + + skc_rasterize_cubics(bp_atomics, + bp_elems, + bp_ids, + bp_mask, + cohort_atomics, + sk_extent, + smem, + &nodeword,&id, + &tv,&cv,cohort); +} + +// +// +// + +__kernel +SKC_RASTERIZE_KERNEL_ATTRIBS +void +skc_kernel_rasterize_rat_quads(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __global float8 const * const transforms, // FIXME -- __constant + __global float4 const * const clips, // FIXME -- __constant + __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant + skc_uint const count) +{ + ; +} + +// +// +// + +__kernel +SKC_RASTERIZE_KERNEL_ATTRIBS +void +skc_kernel_rasterize_rat_cubics(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global union skc_bp_elem * const bp_elems, + __global uint * const bp_ids, + skc_uint const bp_mask, + + __global SKC_ATOMIC_UINT volatile * const cohort_atomics, + __global skc_ttsk_s_t * const sk_extent, + + __global float8 const * const transforms, // FIXME -- __constant + __global float4 const * const clips, // FIXME -- __constant + __global union skc_cmd_rasterize const * const cmds, // FIXME -- __constant + skc_uint const count) +{ + ; +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/rasters_alloc.cl b/src/compute/skc/platforms/cl_12/kernels/rasters_alloc.cl index 0c7da7d0ad..0db21de9b6 100644 --- a/src/compute/skc/platforms/cl_12/kernels/rasters_alloc.cl +++ b/src/compute/skc/platforms/cl_12/kernels/rasters_alloc.cl @@ -1,144 +1,144 @@ -/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "tile.h"
-#include "raster.h"
-#include "atomic_cl.h"
-#include "block_pool_cl.h"
-#include "raster_builder_cl_12.h"
-#include "device_cl_12.h"
-
-//
-// There is a fixed-size meta table per raster cohort that we use to
-// perform a mostly coalesced sizing and allocation of blocks.
-//
-// This code is simple and fast.
-//
-
-__kernel
-SKC_RASTERS_ALLOC_KERNEL_ATTRIBS
-void
-skc_kernel_rasters_alloc(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
- __global skc_block_id_t const * const bp_ids,
- skc_uint const bp_mask, // pow2 modulo mask for block pool ring
- __global skc_block_id_t * const map,
- __global skc_uint * const metas,
- __global skc_uint const * const raster_ids, // FIXME -- CONSTANT
- skc_uint const count)
-{
- // access to the meta extent is linear
- skc_uint const gid = get_global_id(0);
- skc_bool const is_active = gid < count;
-
- //
- // init with defaults for all lanes
- //
- union skc_raster_cohort_meta_inout meta = { .in.u32v4 = { 0, 0, 0, 0 } };
- skc_uint raster_id = SKC_UINT_MAX;
- skc_uint extra_blocks = 0;
-
- if (is_active)
- {
- // load meta_in
- meta.in.u32v4 = vload4(gid,metas);
-
- // load raster_id as early as possible
- raster_id = raster_ids[gid];
-
-#if 0
- printf("%3u + %5u, %5u, %5u, %5u\n",
- gid,
- meta.in.blocks,
- meta.in.offset,
- meta.in.pk,
- meta.in.rk);
-#endif
-
- // how many blocks will the ttpb blocks consume?
- extra_blocks = ((meta.in.pk * SKC_TILE_RATIO + SKC_DEVICE_SUBBLOCKS_PER_BLOCK - SKC_TILE_RATIO) /
- SKC_DEVICE_SUBBLOCKS_PER_BLOCK);
-
- // total keys
- meta.out.keys += meta.in.pk;
-
- // how many blocks do we need to store the keys in the head and trailing nodes?
- skc_uint const hn = ((SKC_RASTER_HEAD_DWORDS + meta.out.keys + SKC_RASTER_NODE_DWORDS - 2) /
- (SKC_RASTER_NODE_DWORDS - 1));
- // increment blocks
- extra_blocks += hn;
-
- // how many nodes trail the head?
- meta.out.nodes = hn - 1;
-
- // update blocks
- meta.out.blocks += extra_blocks;
-
-#if 0
- printf("%3u - %5u, %5u, %5u, %5u\n",
- gid,
- meta.out.blocks,
- meta.out.offset,
- meta.out.nodes,
- meta.out.keys);
-#endif
- }
-
- //
- // allocate blocks from block pool
- //
- // first perform a prefix sum on the subgroup to reduce atomic
- // operation traffic
- //
- // note this idiom can be implemented with vectors, subgroups or
- // workgroups
- //
-
- skc_uint const prefix = SKC_RASTERS_ALLOC_INCLUSIVE_ADD(extra_blocks);
- skc_uint reads = 0;
-
- // last lane performs the block pool allocation with an atomic increment
- if (SKC_RASTERS_ALLOC_LOCAL_ID() == SKC_RASTERS_ALLOC_GROUP_SIZE - 1) {
- reads = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,prefix); // ring_reads
- }
-
- // broadcast block pool base to all lanes
- reads = SKC_RASTERS_ALLOC_BROADCAST(reads,SKC_RASTERS_ALLOC_GROUP_SIZE - 1);
-
- // update base for each lane
- reads += prefix - extra_blocks;
-
- //
- // store meta header
- //
- if (is_active)
- {
- // store headers back to meta extent
- vstore4(meta.out.u32v4,gid,metas);
-
- // store reads
- metas[SKC_RASTER_COHORT_META_OFFSET_READS + gid] = reads;
-
- // get block_id of each raster head
- skc_block_id_t const block_id = bp_ids[reads & bp_mask];
-
- // update map
- map[raster_id] = block_id;
-
-#if 0
- printf("alloc: %u / %u\n",raster_id,block_id);
-#endif
- }
-}
-
-//
-//
-//
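Editor's note: the allocation idiom in skc_kernel_rasters_alloc (unchanged by this diff) relies on a subgroup inclusive prefix sum so that only the last lane touches the block-pool atomic. Below is a minimal host-side C sketch of that idiom; the lane count and the pool counter are illustrative stand-ins for the OpenCL subgroup and bp_atomics, not SKC API.

#include <stdint.h>
#include <stdio.h>

/* Toy model of the rasters_alloc allocation idiom: each "lane" wants
   extra_blocks[i] blocks; an inclusive prefix sum lets the last lane
   perform a single fetch-add on the shared counter, and every lane
   then derives its private base offset from the broadcast result. */
#define LANES 8

static uint32_t pool_reads = 100; /* stand-in for bp_atomics[READS] */

int main(void)
{
  uint32_t extra_blocks[LANES] = { 2, 0, 3, 1, 0, 4, 2, 1 };
  uint32_t prefix[LANES];

  /* inclusive scan -- the subgroup scan on the device */
  uint32_t sum = 0;
  for (int i = 0; i < LANES; i++) {
    sum      += extra_blocks[i];
    prefix[i] = sum;
  }

  /* last lane performs the single atomic add of the subgroup total */
  uint32_t const reads = pool_reads;
  pool_reads          += prefix[LANES - 1];

  /* every lane: base = broadcast(reads) + prefix - extra_blocks */
  for (int i = 0; i < LANES; i++)
    printf("lane %d allocates %u block(s) at ring offset %u\n",
           i, extra_blocks[i], reads + prefix[i] - extra_blocks[i]);

  return 0;
}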
+/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "tile.h" +#include "raster.h" +#include "atomic_cl.h" +#include "block_pool_cl.h" +#include "raster_builder_cl_12.h" +#include "kernel_cl_12.h" + +// +// There is a fixed-size meta table per raster cohort that we use to +// peform a mostly coalesced sizing and allocation of blocks. +// +// This code is simple and fast. +// + +__kernel +SKC_RASTERS_ALLOC_KERNEL_ATTRIBS +void +skc_kernel_rasters_alloc(__global SKC_ATOMIC_UINT volatile * const bp_atomics, + __global skc_block_id_t const * const bp_ids, + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t * const map, + __global skc_uint * const metas, + __global skc_uint const * const raster_ids, // FIXME -- CONSTANT + skc_uint const count) +{ + // access to the meta extent is linear + skc_uint const gid = get_global_id(0); + skc_bool const is_active = gid < count; + + // + // init with defaults for all lanes + // + union skc_raster_cohort_meta_inout meta = { .in.u32v4 = { 0, 0, 0, 0 } }; + skc_uint raster_id = SKC_UINT_MAX; + skc_uint extra_blocks = 0; + + if (is_active) + { + // load meta_in + meta.in.u32v4 = vload4(gid,metas); + + // load raster_id as early as possible + raster_id = raster_ids[gid]; + +#if 0 + printf("%3u + %5u, %5u, %5u, %5u\n", + gid, + meta.in.blocks, + meta.in.offset, + meta.in.pk, + meta.in.rk); +#endif + + // how many blocks will the ttpb blocks consume? + extra_blocks = ((meta.in.pk * SKC_TILE_RATIO + SKC_DEVICE_SUBBLOCKS_PER_BLOCK - SKC_TILE_RATIO) / + SKC_DEVICE_SUBBLOCKS_PER_BLOCK); + + // total keys + meta.out.keys += meta.in.pk; + + // how many blocks do we need to store the keys in the head and trailing nodes? + skc_uint const hn = ((SKC_RASTER_HEAD_DWORDS + meta.out.keys + SKC_RASTER_NODE_DWORDS - 2) / + (SKC_RASTER_NODE_DWORDS - 1)); + // increment blocks + extra_blocks += hn; + + // how many nodes trail the head? 
+ meta.out.nodes = hn - 1; + + // update blocks + meta.out.blocks += extra_blocks; + +#if 0 + printf("%3u - %5u, %5u, %5u, %5u\n", + gid, + meta.out.blocks, + meta.out.offset, + meta.out.nodes, + meta.out.keys); +#endif + } + + // + // allocate blocks from block pool + // + // first perform a prefix sum on the subgroup to reduce atomic + // operation traffic + // + // note this idiom can be implemented with vectors, subgroups or + // workgroups + // + + skc_uint const prefix = SKC_RASTERS_ALLOC_INCLUSIVE_ADD(extra_blocks); + skc_uint reads = 0; + + // last lane performs the block pool allocation with an atomic increment + if (SKC_RASTERS_ALLOC_LOCAL_ID() == SKC_RASTERS_ALLOC_GROUP_SIZE - 1) { + reads = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,prefix); // ring_reads + } + + // broadcast block pool base to all lanes + reads = SKC_RASTERS_ALLOC_BROADCAST(reads,SKC_RASTERS_ALLOC_GROUP_SIZE - 1); + + // update base for each lane + reads += prefix - extra_blocks; + + // + // store meta header + // + if (is_active) + { + // store headers back to meta extent + vstore4(meta.out.u32v4,gid,metas); + + // store reads + metas[SKC_RASTER_COHORT_META_OFFSET_READS + gid] = reads; + + // get block_id of each raster head + skc_block_id_t const block_id = bp_ids[reads & bp_mask]; + + // update map + map[raster_id] = block_id; + +#if 0 + printf("alloc: %u / %u\n",raster_id,block_id); +#endif + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl b/src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl index 27411cfe96..b0eb7ea7ae 100644 --- a/src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl +++ b/src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl @@ -1,442 +1,442 @@ -/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "tile.h"
-#include "block.h"
-#include "raster.h"
-#include "common.h"
-#include "atomic_cl.h"
-#include "block_pool_cl.h"
-#include "device_cl_12.h"
-
-//
-//
-//
-
-#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
-
-#define SKC_RASTERS_RECLAIM_SUBGROUP_WORDS (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE * SKC_RASTERS_RECLAIM_LOCAL_ELEMS)
-
-#define SKC_RASTERS_RECLAIM_X (SKC_DEVICE_BLOCK_DWORDS / SKC_RASTERS_RECLAIM_SUBGROUP_WORDS)
-
-//
-//
-//
-
-#if ( SKC_RASTERS_RECLAIM_X == 1 )
-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1()
-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 0
-
-#elif ( SKC_RASTERS_RECLAIM_X == 2 )
-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2()
-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 1
-
-#elif ( SKC_RASTERS_RECLAIM_X == 4 )
-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4()
-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 3
-
-#elif ( SKC_RASTERS_RECLAIM_X == 8 )
-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8()
-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 7
-
-#elif ( SKC_RASTERS_RECLAIM_X == 16)
-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16()
-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 15
-
-#else
-#error "MISSING SKC_RASTERS_RECLAIM_X"
-#endif
-
-#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE )
-
-#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L)
-#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
-#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
-
-#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
-
-#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
-#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1)
-#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) ((I / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_RATIO + \
- (I & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
-
-#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L)
-#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
-#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_RATIO * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
-
-#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
-
-#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
-#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
-
-#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (((L) & ~SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
-#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
-#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO)
-
-#endif
-
-//
-// FIXME -- slate these for replacement
-//
-
-#define SKC_BROADCAST(E,S,I) \
- sub_group_broadcast(E,S - I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
-
-#define SKC_BROADCAST_LAST_HELPER(E,I) \
- sub_group_broadcast(E,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
-
-#define SKC_BROADCAST_LAST(E,I) \
- SKC_BROADCAST_LAST_HELPER(E,I)
-
-//
-// COMPILE-TIME PREDICATES
-//
-
-#define SKC_RASTERS_RECLAIM_ELEM_GTE(X,I) \
- SKC_GTE_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
-
-#define SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(X,I) \
- (skc_bool)SKC_GTE_MACRO(X, I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) && \
- (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
-
-#define SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I) \
- SKC_RASTERS_RECLAIM_ELEM_GTE(SKC_RASTER_HEAD_DWORDS,I)
-
-#define SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I) \
- SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(SKC_RASTER_HEAD_DWORDS,I)
-
-//
-// RUN-TIME PREDICATES
-//
-
-#define SKC_RASTERS_RECLAIM_IS_HEADER(I) \
- (get_sub_group_local_id() + I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE < SKC_RASTER_HEAD_DWORDS)
-
-//
-// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
-// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
-// COMBOS (NOT NECESSARILY POW2)
-//
-// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
-// UINT TYPE INSTEAD OF A ULONG.
-//
-
-#define SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2
-#define SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE skc_uint
-
-//
-//
-//
-
-#define SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS)
-
-#define SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \
- (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \
- ? 0 : (1u << SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I))
-
-#define SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \
- S = sub_group_scan_exclusive_add(C)
-
-#define SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(C,I) \
- (((C) >> (SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK)
-
-//
-//
-//
-
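Editor's note: the packed-count bitfield approach flagged in the FIXME above packs one small count per register slot into disjoint bit ranges of a single uint so that one subgroup exclusive scan produces per-slot indices for every lane at once. A standalone C model of that mechanism follows; the lane count, slot count, and 3-bit field width are chosen for illustration only.

#include <stdint.h>
#include <stdio.h>

/* Every lane packs one flag per register slot into disjoint bitfields
   of a uint32_t; a single exclusive prefix sum of the packed words
   across lanes then yields a per-slot index for every lane. */
#define LANES      8
#define SLOTS      2
#define FIELD_BITS 3u            /* wide enough for prefixes up to LANES-1 */
#define FIELD_MASK ((1u << FIELD_BITS) - 1u)

static uint32_t get(uint32_t packed, int slot)
{
  return (packed >> (FIELD_BITS * slot)) & FIELD_MASK;
}

int main(void)
{
  /* per-lane "this element is a whole block" flags for each slot */
  uint32_t const is_block[LANES][SLOTS] = {
    {1,0},{0,1},{1,1},{0,0},{1,0},{1,1},{0,0},{1,1}
  };

  uint32_t packed[LANES];
  for (int l = 0; l < LANES; l++) {
    packed[l] = 0;
    for (int s = 0; s < SLOTS; s++)
      packed[l] |= is_block[l][s] << (FIELD_BITS * s);
  }

  /* exclusive scan across lanes -- sub_group_scan_exclusive_add() */
  uint32_t scan = 0;
  for (int l = 0; l < LANES; l++) {
    uint32_t const index = scan;   /* packed per-slot exclusive prefix */
    scan += packed[l];             /* fields are sized so they never overflow */
    for (int s = 0; s < SLOTS; s++)
      printf("lane %d slot %d: count=%u index=%u\n",
             l, s, is_block[l][s], get(index, s));
  }
  return 0;
}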
-struct skc_reclaim
-{
- skc_raster_h aN[SKC_RECLAIM_ARRAY_SIZE];
-};
-
-__kernel
-SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS
-void
-skc_kernel_rasters_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring
- __global skc_uint * const bp_elems, // block pool blocks
- __global skc_uint volatile * const bp_atomics, // read/write atomics
- skc_uint const bp_mask, // pow2 modulo mask for block pool ring
- __global skc_block_id_t const * const map, // raster host-to-device map
- struct skc_reclaim const reclaim) // array of host raster ids
-{
-#if (__OPENCL_VERSION__ < 200)
- skc_uint const reclaim_stride = get_num_sub_groups();
-#else
- skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
-#endif
- skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id();
-
-#if 0
- //
- // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
- // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
- // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
- // RECLAMATION JOB ON THE REST OF THE PIPELINE.
- //
- for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
-#endif
- {
- // get host raster id
- skc_raster_h const raster = reclaim.aN[reclaim_idx];
-
- // get block id of raster header
- skc_block_id_t id = map[raster];
-
- //
- // load all of the head block ttxk.lo keys into registers
- //
- // FIXME -- this pattern lends itself to using the higher
- // performance Intel GEN block load instructions
- //
- skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- skc_uint h##I = bp_elems[head_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-
- //
- // pick out count.nodes and count.prims from the header
- //
- // load raster header counts -- we only need the blocks and
-    // nodes words; the keys are doublewords.
- //
- // FIXME -- this can be made portable with compile-time macro expansion
- //
- skc_uint count_blocks = sub_group_broadcast(h0,0); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
- skc_uint count_nodes = sub_group_broadcast(h0,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
-
-#if 0
- if (get_sub_group_local_id() == 0) {
- printf("reclaim rasters: %u / %u / %5u / %5u\n",raster,id,count_blocks,count_nodes);
- }
-#endif
- //
- // acquire a span in the block pool ids ring for reclaimed ids
- //
- skc_uint bp_ids_base = 0;
-
- if (get_sub_group_local_id() == 0) {
- bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
- }
-
- bp_ids_base = sub_group_broadcast(bp_ids_base,0);
-
- //
- // mask off everything but the block id
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
- h##I = h##I & SKC_TTXK_LO_MASK_ID; \
- }
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-
- //
- // swap current id with next
- //
- if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
- {
- skc_block_id_t const next = SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
-
- SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
-
- id = next;
-#if 0
- printf("rasters next = %u\n",id);
-#endif
- }
-
-#if 0
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- printf("%08X %u\n",h##I,h##I);
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-#endif
-
-#if 0
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
- printf("%08X\n",h##I); \
- }
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-#endif
-
- //
- // - we'll skip subgroups that are entirely header
- //
- // - but we need to mark any header elements that partially fill
- // a subgroup as subblocks
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
- if (SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)) { \
- if (SKC_RASTERS_RECLAIM_IS_HEADER(I)) { \
- h##I = SKC_UINT_MAX; \
- } \
- } \
- }
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-
- {
- //
- // count reclaimable blocks in each lane
- //
- SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
- packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
- }
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-
- //
- // scan to find index of each block
- //
- SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
-
- SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
-
- //
- // store blocks back to ring
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
- skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
- skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
- skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
- if (count > 0) { \
- bp_ids[bp_ids_idx] = h##I; \
- } \
- skc_uint const total = index + count; \
- bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
- }
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
- }
-
- // printf("R %7u ! %u\n",bp_ids_idx,h##I);
-
- //
- // we're done if it was just the header
- //
- if (count_nodes == 0)
- return;
-
- //
- // otherwise, walk the nodes
- //
- do {
- // id of next block is in last lane
- id = sub_group_broadcast(id,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1);
-
- //
- // load all of the node block ttxk.lo keys into registers
- //
- // FIXME -- this pattern lends itself to using the higher
- // performance Intel GEN block load instructions
- //
- skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- skc_uint n##I = bp_elems[node_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-
- //
- // mask off everything but the block id
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- n##I = n##I & SKC_TTXK_LO_MASK_ID;
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-
- //
- // swap current id with next
- //
- if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
- {
- skc_block_id_t const next = SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
-
- SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
-
- id = next;
-#if 0
- printf("rasters next = %u\n",id);
-#endif
- }
-
-#if 0
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- printf("%08X %u\n",n##I,n##I);
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-#endif
-
- //
- // count reclaimable blocks in each lane
- //
- SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-
- //
- // scan to find index of each block
- //
- SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
-
- SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
-
- //
- // store blocks back to ring
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) { \
- skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
- skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
- skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
- if (count > 0) { \
- bp_ids[bp_ids_idx] = n##I; \
- } \
- skc_uint const total = index + count; \
- bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
- }
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-
- // printf("R %7u ! %u\n",bp_ids_idx,n##I);
-
- // any more nodes?
- } while (--count_nodes > 0);
- }
-}
-
-//
-//
-//
+/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "tile.h" +#include "block.h" +#include "raster.h" +#include "common.h" +#include "atomic_cl.h" +#include "block_pool_cl.h" +#include "kernel_cl_12.h" + +// +// +// + +#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) + +#define SKC_RASTERS_RECLAIM_SUBGROUP_WORDS (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE * SKC_RASTERS_RECLAIM_LOCAL_ELEMS) + +#define SKC_RASTERS_RECLAIM_X (SKC_DEVICE_BLOCK_DWORDS / SKC_RASTERS_RECLAIM_SUBGROUP_WORDS) + +// +// +// + +#if ( SKC_RASTERS_RECLAIM_X == 1 ) +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1() +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 0 + +#elif ( SKC_RASTERS_RECLAIM_X == 2 ) +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2() +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 1 + +#elif ( SKC_RASTERS_RECLAIM_X == 4 ) +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4() +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 3 + +#elif ( SKC_RASTERS_RECLAIM_X == 8 ) +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8() +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 7 + +#elif ( SKC_RASTERS_RECLAIM_X == 16) +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16() +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 15 + +#else +#error "MISSING SKC_RASTERS_RECLAIM_X" +#endif + +#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) + +#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L) +#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) +#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) + +#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1 + +#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) +#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) +#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) ((I / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_RATIO + \ + (I & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK)) + +#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L) +#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) +#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_RATIO * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) + +#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1 + +#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE) +#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask + +#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (((L) & ~SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK)) +#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) +#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) + +#endif + +// +// FIXME -- slate these for replacement +// + +#define SKC_BROADCAST(E,S,I) \ + sub_group_broadcast(E,S - I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) + 
+#define SKC_BROADCAST_LAST_HELPER(E,I) \ + sub_group_broadcast(E,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) + +#define SKC_BROADCAST_LAST(E,I) \ + SKC_BROADCAST_LAST_HELPER(E,I) + +// +// COMPILE-TIME PREDICATES +// + +#define SKC_RASTERS_RECLAIM_ELEM_GTE(X,I) \ + SKC_GTE_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) + +#define SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(X,I) \ + (skc_bool)SKC_GTE_MACRO(X, I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) && \ + (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) + +#define SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I) \ + SKC_RASTERS_RECLAIM_ELEM_GTE(SKC_RASTER_HEAD_DWORDS,I) + +#define SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I) \ + SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(SKC_RASTER_HEAD_DWORDS,I) + +// +// RUN-TIME PREDICATES +// + +#define SKC_RASTERS_RECLAIM_IS_HEADER(I) \ + (get_sub_group_local_id() + I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE < SKC_RASTER_HEAD_DWORDS) + +// +// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL +// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK +// COMBOS (NOT NECESSARILY POW2) +// +// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR +// UINT TYPE INSTEAD OF A ULONG. +// + +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2 +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE skc_uint + +// +// +// + +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS) + +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \ + (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \ + ? 0 : (1u << SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) + +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \ + S = sub_group_scan_exclusive_add(C) + +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(C,I) \ + (((C) >> (SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK) + +// +// +// + +struct skc_reclaim +{ + skc_raster_h aN[SKC_RECLAIM_ARRAY_SIZE]; +}; + +__kernel +SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS +void +skc_kernel_rasters_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring + __global skc_uint * const bp_elems, // block pool blocks + __global skc_uint volatile * const bp_atomics, // read/write atomics + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const map, // raster host-to-device map + struct skc_reclaim const reclaim) // array of host raster ids +{ +#if (__OPENCL_VERSION__ < 200) + skc_uint const reclaim_stride = get_num_sub_groups(); +#else + skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups +#endif + skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id(); + +#if 0 + // + // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT + // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL + // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE + // RECLAMATION JOB ON THE REST OF THE PIPELINE. 
+ // + for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride) +#endif + { + // get host raster id + skc_raster_h const raster = reclaim.aN[reclaim_idx]; + + // get block id of raster header + skc_block_id_t id = map[raster]; + + // + // load all of the head block ttxk.lo keys into registers + // + // FIXME -- this pattern lends itself to using the higher + // performance Intel GEN block load instructions + // + skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id()); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + skc_uint h##I = bp_elems[head_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)]; + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // pick out count.nodes and count.prims from the header + // + // load raster header counts -- we only need the blocks and + // nodes words the keys are doublewords. + // + // FIXME -- this can be made portable with compile-time macro expansion + // + skc_uint count_blocks = sub_group_broadcast(h0,0); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES + skc_uint count_nodes = sub_group_broadcast(h0,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS + +#if 0 + if (get_sub_group_local_id() == 0) { + printf("reclaim rasters: %u / %u / %5u / %5u\n",raster,id,count_blocks,count_nodes); + } +#endif + // + // acquire a span in the block pool ids ring for reclaimed ids + // + skc_uint bp_ids_base = 0; + + if (get_sub_group_local_id() == 0) { + bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks); + } + + bp_ids_base = sub_group_broadcast(bp_ids_base,0); + + // + // mask off everything but the block id + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ + h##I = h##I & SKC_TTXK_LO_MASK_ID; \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // swap current id with next + // + if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) + { + skc_block_id_t const next = SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST); + + SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; + + id = next; +#if 0 + printf("rasters next = %u\n",id); +#endif + } + +#if 0 +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + printf("%08X %u\n",h##I,h##I); + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); +#endif + +#if 0 +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ + printf("%08X\n",h##I); \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); +#endif + + // + // - we'll skip subgroups that are entirely header + // + // - but we need to mark any header elements that partially fill + // a subgroup as subblocks + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ + if (SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)) { \ + if (SKC_RASTERS_RECLAIM_IS_HEADER(I)) { \ + h##I = SKC_UINT_MAX; \ + } \ + } \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + { + // + // count reclaimable blocks in each lane + // + SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ + packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // scan to find index of each block + // + SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); + + SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); + + // + // 
store blocks back to ring + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ + skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ + skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ + skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ + if (count > 0) { \ + bp_ids[bp_ids_idx] = h##I; \ + } \ + skc_uint const total = index + count; \ + bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + } + + // printf("R %7u ! %u\n",bp_ids_idx,h##I); + + // + // we're done if it was just the header + // + if (count_nodes == 0) + return; + + // + // otherwise, walk the nodes + // + do { + // id of next block is in last lane + id = sub_group_broadcast(id,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); + + // + // load all of the node block ttxk.lo keys into registers + // + // FIXME -- this pattern lends itself to using the higher + // performance Intel GEN block load instructions + // + skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id()); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + skc_uint n##I = bp_elems[node_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)]; + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // mask off everything but the block id + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + n##I = n##I & SKC_TTXK_LO_MASK_ID; + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // swap current id with next + // + if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) + { + skc_block_id_t const next = SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST); + + SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; + + id = next; +#if 0 + printf("rasters next = %u\n",id); +#endif + } + +#if 0 +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + printf("%08X %u\n",n##I,n##I); + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); +#endif + + // + // count reclaimable blocks in each lane + // + SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I); + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // scan to find index of each block + // + SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); + + SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); + + // + // store blocks back to ring + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) { \ + skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ + skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ + skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ + if (count > 0) { \ + bp_ids[bp_ids_idx] = n##I; \ + } \ + skc_uint const total = index + count; \ + bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // printf("R %7u ! %u\n",bp_ids_idx,n##I); + + // any more nodes? + } while (--count_nodes > 0); + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/render.cl b/src/compute/skc/platforms/cl_12/kernels/render.cl index 9205334940..a7b32299c9 100644 --- a/src/compute/skc/platforms/cl_12/kernels/render.cl +++ b/src/compute/skc/platforms/cl_12/kernels/render.cl @@ -1,2165 +1,2165 @@ -/*
- * Copyright 2016 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "tile.h"
-#include "block.h"
-#include "styling_types.h"
-#include "atomic_cl.h"
-#include "device_cl_12.h"
-
-//
-//
-//
-
-#define SKC_RENDER_SUBGROUP_MASK (SKC_RENDER_SUBGROUP_SIZE - 1)
-
-//
-//
-//
-
-#if ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 )
-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_1()
-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 0
-
-#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 2 )
-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_2()
-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 1
-
-#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 4 )
-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_4()
-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 3
-
-#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 8 )
-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_8()
-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 7
-
-#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 16)
-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_16()
-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 15
-#endif
-
-//
-// tile state flag bits
-//
-
-typedef enum skc_tile_flags_e {
-
- // FLUSH
- SKC_TILE_FLAGS_FLUSH_FINALIZE = 0x00000001,
- SKC_TILE_FLAGS_FLUSH_UNWIND = 0x00000002,
- SKC_TILE_FLAGS_FLUSH_COMPLETE = 0x00000004,
-
- // OPACITY
- SKC_TILE_FLAGS_SCATTER_SKIP = 0x00000008,
-
- //
- // Note: testing for opacity and skipping scattering is on its way
- // to becoming a much more programmable option because sometimes we
- // may be compositing/blending from back-to-front and/or be using
- // group blend rules that ignore opacity.
- //
- // The point is that all of these decisions should be encoded in
- // styling commands and, as much as possible, removed from the final
- // group/layer styling traversal render loop.
- //
-
-} skc_tile_flags_e;
-
-//
-// COVER -- assumes availability of either fp16 or fp32
-//
-
-union skc_tile_cover
-{
- struct {
- SKC_RENDER_TILE_COVER c[SKC_TILE_WIDTH];
- } aN;
-
-#ifdef SKC_RENDER_TILE_COVER_VECTOR
- struct {
- SKC_RENDER_TILE_COVER_VECTOR c[SKC_RENDER_TILE_COVER_VECTOR_COUNT];
- } vN;
-#endif
-};
-
-//
-// COLOR -- assumes availability of either fp16 or fp32
-//
-
-union skc_tile_color
-{
- union {
- struct {
- SKC_RENDER_TILE_COLOR r;
- SKC_RENDER_TILE_COLOR g;
- SKC_RENDER_TILE_COLOR b;
- SKC_RENDER_TILE_COLOR a;
- } rgba[SKC_TILE_WIDTH];
- } aN;
-
-#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
- union {
- SKC_RENDER_TILE_COLOR_INTERLEAVED rgba[SKC_TILE_WIDTH];
- } iN;
-#endif
-
-#ifdef SKC_RENDER_TILE_COLOR_VECTOR
- union {
- SKC_RENDER_TILE_COLOR_VECTOR rgba[SKC_RENDER_TILE_COLOR_VECTOR_COUNT];
- } vN;
-#endif
-
- struct {
- union {
- struct {
- SKC_RENDER_TILE_COLOR r;
- SKC_RENDER_TILE_COLOR g;
- };
- SKC_RENDER_GRADIENT_FLOAT distance;
- };
- union {
- struct {
- SKC_RENDER_TILE_COLOR b;
- SKC_RENDER_TILE_COLOR a;
- };
- SKC_RENDER_GRADIENT_FLOAT stoplerp;
- };
- } grad[SKC_TILE_WIDTH];
-};
-
-//
-// SHARED MEMORY STATE
-//
-
-#define SKC_RENDER_TILE_SMEM_WORDS ((SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT)
-
-#define SKC_RENDER_WIDE_AA_BYTES (SKC_RENDER_TILE_SMEM_WORDS * sizeof(int) / SKC_RENDER_SUBGROUP_SIZE)
-#define SKC_RENDER_WIDE_AA_WIDTH (SKC_RENDER_WIDE_AA_BYTES / sizeof(SKC_RENDER_WIDE_AA))
-
-//
-//
-//
-
-union skc_subgroup_smem
-{
- //
- // The tiles are stored in column-major / height-major order
- //
- // The final column is a guard column that is OK to write to but
- // will never be read. It simplifies the TTSB scatter but could be
- // predicated if SMEM is really at a premium.
- //
-#if ( SKC_RENDER_SUBGROUP_SIZE > 1 )
- struct {
- SKC_ATOMIC_UINT area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h]
- } atomic;
-#endif
-
- struct {
- int area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h]
- } aN;
-
- struct { // assumption is that height = subgroup
- SKC_RENDER_AREA_V area[SKC_TILE_WIDTH + 1][SKC_RENDER_SUBGROUP_SIZE];
- } vN;
-
- struct { // assumption is that height = subgroup
- SKC_RENDER_WIDE_AA area[SKC_RENDER_WIDE_AA_WIDTH][SKC_RENDER_SUBGROUP_SIZE];
- } wide;
-
- union skc_styling_cmd cmds[(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT];
-
- half gc [(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT * 2];
-
-#if 0
- //
- // SPILL TO GMEM
- //
-#if (SKC_REGS_COLOR_S > 0) || (SKC_REGS_COVER_S > 0)
- struct {
-
-#if (SKC_REGS_COLOR_S > 0)
- union skc_color_r color[SKC_REGS_COLOR_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH];
-#endif
-
-#if (SKC_REGS_COVER_S > 0)
- union float cover[SKC_REGS_COVER_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH];
-#endif
-
- } regs;
-#endif
- //
- //
- //
-#endif
-};
-
-//
-//
-//
-
-#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
-
-#define skc_subgroup_lane() 0
-
-#else
-
-#define skc_subgroup_lane() get_sub_group_local_id()
-
-#endif
-
-//
-//
-//
-
-typedef skc_uint skc_ttsk_lo_t;
-typedef skc_uint skc_ttsk_hi_t;
-
-typedef skc_uint skc_ttpk_lo_t;
-typedef skc_uint skc_ttpk_hi_t;
-
-typedef skc_uint skc_ttxk_lo_t;
-typedef skc_uint skc_ttxk_hi_t;
-
-typedef skc_uint skc_ttck_lo_t;
-typedef skc_uint skc_ttck_hi_t;
-
-typedef skc_uint2 skc_ttck_t;
-
-typedef skc_int skc_ttxb_t;
-
-//
-// TTCK (32-BIT COMPARE) v1:
-//
-// 0 63
-// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
-// +----------------------+--------+--------+-------+-----+-----+
-// | 30 | 1 | 1 | 18 | 7 | 7 |
-//
-//
-// TTCK (32-BIT COMPARE) v2:
-//
-// 0 63
-// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
-// +----------------------+--------+--------+-------+-----+-----+
-// | 30 | 1 | 1 | 15 | 9 | 8 |
-//
-//
-// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
-//
-// 0 63
-// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
-// +----------------------+--------+--------+-------+-----+-----+
-// | 27 | 1 | 1 | 18 | 9 | 8 |
-//
-
-static
-skc_uint
-skc_ttck_lo_get_ttxb_id(skc_ttck_lo_t const a)
-{
- return a & SKC_TTCK_LO_MASK_ID;
-}
-
-static
-skc_layer_id
-skc_ttck_get_layer(skc_ttck_t const a)
-{
- //
- // FIXME -- a union with a ulong and a shift down and mask is
- // probably faster on some architectures
- //
- skc_uint const lo = (a.lo >> SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE);
- skc_uint const hi = (a.hi & SKC_TTCK_HI_MASK_LAYER) << SKC_TTCK_LO_BITS_LAYER;
-
- return lo | hi;
-}
-
-static
-skc_uint
-skc_ttck_hi_get_x(skc_ttck_hi_t const a)
-{
- return SKC_BFE(a,SKC_TTCK_HI_BITS_X,SKC_TTCK_HI_OFFSET_X);
-}
-
-static
-skc_uint
-skc_ttck_hi_get_y(skc_ttck_hi_t const a)
-{
- return a >> SKC_TTCK_HI_OFFSET_Y;
-}
-
-static
-skc_bool
-skc_ttck_equal_yxl(skc_ttck_t const a, skc_ttck_t const b)
-{
- skc_uint const lo = (a.lo ^ b.lo) & SKC_BITS_TO_MASK_AT(SKC_TTCK_LO_BITS_LAYER,SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE);
- skc_uint const hi = (a.hi ^ b.hi);
-
- return (lo | hi) == 0;
-}
-
-static
-skc_bool
-skc_ttck_hi_equal_yx(skc_ttck_hi_t const a, skc_ttck_hi_t const b)
-{
- return ((a ^ b) & SKC_TTCK_HI_MASK_YX) == 0;
-}
-
-static
-skc_bool
-skc_ttck_lo_is_prefix(skc_ttck_lo_t const a)
-{
- return (a & SKC_TTCK_LO_MASK_PREFIX) != 0;
-}
-
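Editor's note: to make the 64-bit-compare TTCK layout above concrete, here is a small C sketch that packs and unpacks the fields. The widths and offsets (27/1/1/18/9/8, low bits first) are taken from the comment, not from the real SKC headers.

#include <stdint.h>
#include <stdio.h>

/* 64-bit-compare TTCK layout from the comment above:
   | id:27 | prefix:1 | escape:1 | layer:18 | x:9 | y:8 |   (low -> high) */
typedef struct { uint32_t id, prefix, escape, layer, x, y; } ttck_fields;

static uint64_t ttck_pack(ttck_fields const f)
{
  return  (uint64_t)(f.id     & 0x7FFFFFF)        |
         ((uint64_t)(f.prefix & 1)         << 27) |
         ((uint64_t)(f.escape & 1)         << 28) |
         ((uint64_t)(f.layer  & 0x3FFFF)   << 29) |
         ((uint64_t)(f.x      & 0x1FF)     << 47) |
         ((uint64_t)(f.y      & 0xFF)      << 56);
}

static ttck_fields ttck_unpack(uint64_t const k)
{
  ttck_fields f;
  f.id     = (uint32_t)( k        & 0x7FFFFFF);
  f.prefix = (uint32_t)((k >> 27) & 1);
  f.escape = (uint32_t)((k >> 28) & 1);
  f.layer  = (uint32_t)((k >> 29) & 0x3FFFF);
  f.x      = (uint32_t)((k >> 47) & 0x1FF);
  f.y      = (uint32_t)( k >> 56);
  return f;
}

int main(void)
{
  ttck_fields const in  = { 12345, 1, 0, 42, 300, 17 };
  ttck_fields const out = ttck_unpack(ttck_pack(in));

  printf("id=%u prefix=%u layer=%u x=%u y=%u\n",
         out.id, out.prefix, out.layer, out.x, out.y);
  return 0;
}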
-//
-// TILE TRACE SUBPIXEL
-//
-// The subpixels are encoded with either absolute tile coordinates
-// (32 bits) or packed in delta-encoded form.
-//
-// For 32-bit subpixel packing of a 32x32 tile:
-//
-// A tile X is encoded as:
-//
-// TX : 10 : unsigned min(x0,x1) tile subpixel coordinate.
-//
-// SX : 6 : unsigned subpixel span from min to max x with range
-//           [0,32]. The original direction is not captured. It
-//           would be nice to capture dx, but it's not necessary
-//           right now and could be added later. <--- SPARE VALUES AVAILABLE
-//
-// A tile Y is encoded as:
-//
-// TY : 10 : unsigned min(y0,y1) tile subpixel coordinate.
-//
-// DY : 6 : signed subpixel delta y1-y0. The range of delta is
-// [-32,32] but horizontal lines are not encoded so [1,32]
-// is mapped to [0,31]. The resulting range [-32,31] fits
-// in 6 bits.
-//
-// TTS:
-//
-// 0 31
-// | TX | SX | TY | DY |
-// +-----+------+-----+------+
-// | 10 | 6 | 10 | 6 |
-//
-
-static
-SKC_RENDER_TTS_V_BITFIELD
-skc_tts_get_ty_pixel_v(SKC_RENDER_TTS_V const a)
-{
- //
- // extract the whole pixel y coordinate
- //
- return SKC_BFE(a,
- SKC_TTS_BITS_TY - SKC_SUBPIXEL_RESL_Y_LOG2,
- SKC_TTS_OFFSET_TY + SKC_SUBPIXEL_RESL_Y_LOG2);
-}
-
-static
-SKC_RENDER_TTS_V_BITFIELD
-skc_tts_get_xy_idx_v(SKC_RENDER_TTS_V const a)
-{
- //
- // get the linear array tile index of the pixel
- //
- return (((a & SKC_TTS_MASK_TX_PIXEL)
-
-#if (SKC_SUBPIXEL_RESL_X_LOG2 > SKC_TILE_HEIGHT_LOG2)
- >> (SKC_SUBPIXEL_RESL_X_LOG2 - SKC_TILE_HEIGHT_LOG2)
-#elif (SKC_SUBPIXEL_RESL_X_LOG2 < SKC_TILE_HEIGHT_LOG2)
- << (SKC_TILE_HEIGHT_LOG2 - SKC_SUBPIXEL_RESL_X_LOG2)
-#endif
-
- ) | skc_tts_get_ty_pixel_v(a));
-}
-
-#if 0
-static
-skc_ttx_v_s32_t
-skc_tts_get_dy_v(SKC_RENDER_TTS_V const a)
-{
- skc_ttx_v_s32_t const dy = SKC_AS(skc_ttx_v_s32_t)a >> SKC_TTS_OFFSET_DY;
-
- return (dy + SKC_AS(skc_ttx_v_s32_t)(~a >> 31));
-}
-#else
-static
-SKC_RENDER_TTS_V_BITFIELD
-skc_tts_get_dy_v(SKC_RENDER_TTS_V const a)
-{
- SKC_RENDER_TTS_V_BITFIELD const dy = a >> SKC_TTS_OFFSET_DY;
-
- return dy - (~a >> 31);
-}
-#endif
-
-static
-SKC_RENDER_TTS_V_BITFIELD
-skc_tts_get_tx_subpixel_v(SKC_RENDER_TTS_V const a)
-{
- return a & SKC_BITS_TO_MASK(SKC_SUBPIXEL_RESL_X_LOG2);
-}
-
-static
-SKC_RENDER_TTS_V_BITFIELD
-skc_tts_get_sx_v(SKC_RENDER_TTS_V const a)
-{
- return SKC_BFE(a,SKC_TTS_BITS_SX,SKC_TTS_OFFSET_SX);
-}
-
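Editor's note: a matching sketch for the TTS word (TX:10, SX:6, TY:10, DY:6, low bits first), including the dy remapping described above, where a stored non-negative value reads back incremented because dy == 0 never occurs. Field positions follow the comment rather than the real headers.

#include <stdint.h>
#include <stdio.h>

/* TTS layout per the comment: | TX:10 | SX:6 | TY:10 | DY:6 | (low -> high) */
static void tts_decode(uint32_t const tts)
{
  uint32_t const tx =  tts        & 0x3FF;
  uint32_t const sx = (tts >> 10) & 0x3F;
  uint32_t const ty = (tts >> 16) & 0x3FF;
  int32_t        dy = (int32_t)tts >> 26;   /* sign-extended 6-bit field */

  /* stored range is [-32,31]; non-negative values map back to [1,32]
     because a horizontal segment (dy == 0) is never encoded */
  if (dy >= 0)
    dy += 1;

  printf("tx=%u sx=%u ty=%u dy=%d\n", tx, sx, ty, dy);
}

int main(void)
{
  /* tx=100, sx=12, ty=515, dy stored as 3 (reads back as 4) */
  uint32_t const tts = 100u | (12u << 10) | (515u << 16) | (3u << 26);
  tts_decode(tts);
  return 0;
}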
-//
-//
-//
-
-static
-void
-skc_tile_aa_zero(__local union skc_subgroup_smem * SKC_RESTRICT const smem)
-{
- //
- // SIMD / CPU
- //
- // &
- //
- // SIMT / GPU
- //
- // Note that atomic_init() is likely implemented as a simple
- // assignment so there is no identifiable performance difference on
- // current targets.
- //
- // If such an architecture appears in the future then we'll probably
- // still want to implement this zero'ing operation as below but
- // follow with an appropriate fence that occurs before any scatter
- // operations.
- //
- // The baroque expansion below improves performance on Intel GEN by,
- // presumably, achieving the 64-byte per clock SLM write as well as
- // minimizing the overall number of SEND() block initializations and
- // launches.
- //
- // Intel GENx has a documented 64 byte per cycle SLM write limit.
- // So having each lane in an 8 lane subgroup zero-write 8 bytes is
- // probably a safe bet (Later: benchmarking backs this up!).
- //
- // Note there is no reason at this time to unroll this loop.
- //
- for (uint ii=0; ii<SKC_RENDER_WIDE_AA_WIDTH; ii++)
- smem->wide.area[ii][skc_subgroup_lane()] = ( 0 );
-}
-
-//
-// Note this is going to be vectorizable on most architectures.
-//
-// The return of the key translation feature might complicate things.
-//
-
-static
-void
-skc_scatter_ttpb(__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent,
- __local union skc_subgroup_smem * SKC_RESTRICT const smem,
- skc_block_id_t const pb_id)
-{
- skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane();
-
-#if ( SKC_TILE_RATIO == 1 )
-
- SKC_RENDER_TTP_V const ttp_v = ttxb_extent[offset];
-
-#elif ( SKC_TILE_RATIO == 2 )
-
- SKC_RENDER_TTP_V const ttp_v = vload2(offset,ttxb_extent);
-
-#else
-
-#error("tile ratio greater than 2 not supported")
-
-#endif
-
- //
- // Note there is no need to use an atomic for this operation on the
- // current group of target platforms... but this may change if
- // atomic ops truly go through a different path.
- //
- // As noted above, this direct increment is probably faster and can
- // always be followed by a fence.
- //
- // Furthermore, note that the key sorting orders all ttck keys
- // before ttpk keys.
- //
-
- //
- // FIXME -- if the SMEM store is wider than bank word count then we
- // might want to odd-even interleave the TTP values if the target
- // device can't handle 64-bit stores
- //
-
- //
- // skipping per-key translation for now
- //
- smem->vN.area[0][skc_subgroup_lane()] += ttp_v << (SKC_SUBPIXEL_RESL_X_LOG2 + 1);
-}
-
-//
-// Note that skc_scatter_ttsb is *not* vectorizable unless the
-// architecture supports a "scatter-add" capability. All relevant
-// GPUs support atomic add on shared/local memory and thus support
-// scatter-add.
-//
-
-static
-void
-skc_scatter_ttsb(__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent,
- __local union skc_subgroup_smem * SKC_RESTRICT const smem,
- skc_block_id_t const sb_id)
-{
- skc_uint const offset = sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
-
- SKC_RENDER_TTS_V const tts_v = ttxb_extent[offset];
-
- //
- // Skipping per-key translation for now
- //
-
- // Index into tile
- //
- // The tiles are stored in column-major / height-major order
- //
- // The final column is a guard column that is OK to write to but
- // will never be read. It simplifies the TTSB scatter but could be
- // predicated if SMEM is really at a premium.
- //
-
- SKC_RENDER_TTS_V_BITFIELD const xy_idx = skc_tts_get_xy_idx_v(tts_v);
-
-#if 0
- if (tts_v != SKC_TTS_INVALID)
- printf("(%08X) = %u\n",tts_v,xy_idx);
-#endif
-
- //
- // adjust subpixel range to max y
- //
- // range is stored as [-32,31] and when read [0,31] is mapped to
- // [1,32] because a dy of 0 is not possible.
- //
- // more succinctly: if dy >= 0 then ++dy
- //
- SKC_RENDER_TTS_V_BITFIELD const dy = skc_tts_get_dy_v(tts_v);
-
- //
- // FIXME -- benchmark performance of setting dy to 0 if ttsv.vN is invalid?
- //
-
-  // this "min(x0,x1) * 2 + sx" is equivalent to "x0 + x1"
- SKC_RENDER_TTS_V_BITFIELD const widths = skc_tts_get_tx_subpixel_v(tts_v) * 2 + skc_tts_get_sx_v(tts_v);
-
- // Calculate left and right coverage contribution trapezoids
- SKC_RENDER_TTS_V_BITFIELD const left = dy * widths;
- SKC_RENDER_TTS_V_BITFIELD const right = (dy << (SKC_SUBPIXEL_RESL_X_LOG2 + 1)) - left;
-
- //
- // Accumulate altitudes and areas
- //
- // Optimization: if the device supports an CPU/SIMD vector-add or
- // GPU/SIMT scatter-add atomic int2 add operation then placing the
- // ALT and AREA values side-by-side would halve the number of
- // additions.
- //
-#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
- //
- // CPU/SIMD
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) \
- if (tts_v C != SKC_TTS_INVALID) { \
- smem->aN.area[SKC_TILE_HEIGHT + xy_idx C] += left C; \
- smem->aN.area[ xy_idx C] += right C; \
- }
-
-#else
- //
- // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) \
- if (tts_v C != SKC_TTS_INVALID) { \
- SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + \
- SKC_TILE_HEIGHT + xy_idx C, \
- left C); \
- SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + xy_idx C, \
- right C); \
- }
-#endif
-
- SKC_RENDER_TTSB_EXPAND();
-}
-
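Editor's note: the left/right split in skc_scatter_ttsb is trapezoid-area bookkeeping: widths = x0 + x1 in subpixels, left = dy * widths, right = dy * 2 * SKC_SUBPIXEL_RESL_X - left, with everything kept doubled to stay in integers. A toy C check of that arithmetic, assuming a 32-subpixel horizontal resolution:

#include <stdio.h>

/* For a segment crossing a pixel row from subpixel x0 to x1 while rising
   dy subpixels, the area to the left of the segment is dy*(x0+x1)/2 and
   the area to its right is dy*RESL_X minus that; the kernel keeps both
   values doubled so they remain integers. */
#define RESL_X 32

int main(void)
{
  int const x0 = 6, x1 = 14, dy = 5;           /* sample segment           */
  int const widths = x0 + x1;                  /* min(x0,x1)*2 + span      */
  int const left   = dy * widths;              /* 2 * left trapezoid area  */
  int const right  = dy * 2 * RESL_X - left;   /* 2 * right trapezoid area */

  printf("left=%d right=%d total=%d (== dy * 2 * RESL_X = %d)\n",
         left, right, left + right, dy * 2 * RESL_X);
  return 0;
}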
-//
-// Note that 2048.0 can be represented exactly with fp16... fortuitous!
-//
-
-#define SKC_RENDER_FILL_MAX_AREA (2u * SKC_SUBPIXEL_RESL_X * SKC_SUBPIXEL_RESL_Y)
-#define SKC_RENDER_FILL_MAX_AREA_2 (2u * SKC_RENDER_FILL_MAX_AREA)
-#define SKC_RENDER_FILL_EVEN_ODD_MASK (SKC_RENDER_FILL_MAX_AREA_2 - 1)
-#define SKC_RENDER_FILL_MAX_AREA_RCP_F32 (SKC_RENDER_TILE_COVER)(1.0f / SKC_RENDER_FILL_MAX_AREA)
-
-//
-//
-//
-
-static
-void
-skc_tile_cover_nonzero(__local union skc_subgroup_smem * SKC_RESTRICT const smem,
- union skc_tile_cover * SKC_RESTRICT const cover,
- union skc_tile_color * SKC_RESTRICT const color)
-{
- SKC_RENDER_ACC_COVER_INT area = 0;
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- {
- area += smem->vN.area[ii][skc_subgroup_lane()];
- SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area);
- SKC_RENDER_TILE_COVER const nonzero = SKC_CONVERT(SKC_RENDER_TILE_COVER)(min(trapabs,SKC_RENDER_FILL_MAX_AREA));
-
- cover->aN.c[ii] = nonzero * (SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA_RCP_F32);
- }
-}
-
-static
-void
-skc_tile_cover_evenodd(__local union skc_subgroup_smem * SKC_RESTRICT const smem,
- union skc_tile_cover * SKC_RESTRICT const cover,
- union skc_tile_color * SKC_RESTRICT const color)
-{
- SKC_RENDER_ACC_COVER_INT area = 0;
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- {
- area += smem->vN.area[ii][skc_subgroup_lane()];
- SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area);
- SKC_RENDER_ACC_COVER_UINT const reflect = abs(SKC_AS(SKC_RENDER_ACC_COVER_INT)((trapabs & SKC_RENDER_FILL_EVEN_ODD_MASK) - SKC_RENDER_FILL_MAX_AREA));
-
- cover->aN.c[ii] = SKC_CONVERT(SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA - reflect) * (SKC_RENDER_TILE_COVER)SKC_RENDER_FILL_MAX_AREA_RCP_F32;
- }
-}
-
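Editor's note: the nonzero and even-odd rules above reduce to the following scalar arithmetic on the accumulated signed area. A minimal C sketch assuming the 2 * 32 * 32 = 2048 doubled maximum area defined above:

#include <stdio.h>
#include <stdlib.h>

/* Scalar versions of the two coverage rules, using the kernels' doubled
   maximum pixel area of 2048. */
#define MAX_AREA      2048
#define EVEN_ODD_MASK (2 * MAX_AREA - 1)

static double cover_nonzero(int area)
{
  int const trapabs = abs(area);
  int const clamped = trapabs < MAX_AREA ? trapabs : MAX_AREA;
  return (double)clamped / MAX_AREA;
}

static double cover_evenodd(int area)
{
  int const trapabs = abs(area);
  int const reflect = abs((trapabs & EVEN_ODD_MASK) - MAX_AREA);
  return (double)(MAX_AREA - reflect) / MAX_AREA;
}

int main(void)
{
  int const samples[] = { 0, 1024, 2048, 3072, 4096, -1024 };
  for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
    printf("area %5d : nonzero %.3f  even-odd %.3f\n",
           samples[i], cover_nonzero(samples[i]), cover_evenodd(samples[i]));
  return 0;
}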
-//
-//
-//
-
-static
-void
-skc_tile_color_fill_solid(__global union skc_styling_cmd const * SKC_RESTRICT const commands,
- uint * SKC_RESTRICT const cmd_next,
- union skc_tile_color * SKC_RESTRICT const color)
-{
- //
- // rgba = solid fill
- //
- __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0;
-
- *cmd_next += 2;
-
-#if !defined( SKC_RENDER_TILE_COLOR_VECTOR )
-
- SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- color->aN.rgba[ii].r = rg.lo;
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- color->aN.rgba[ii].g = rg.hi;
-
- SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- color->aN.rgba[ii].b = ba.lo;
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- color->aN.rgba[ii].a = ba.hi;
-
-#else
-
- SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);
- SKC_RENDER_TILE_COLOR const r = rg.lo;
-
- // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
- for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
- color->vN.rgba[ii].even.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(r);
-
- SKC_RENDER_TILE_COLOR const g = rg.hi;
-
- // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
- for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
- color->vN.rgba[ii].odd.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(g);
-
- SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);
- SKC_RENDER_TILE_COLOR const b = ba.lo;
-
- // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
- for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
- color->vN.rgba[ii].even.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(b);
-
- SKC_RENDER_TILE_COLOR const a = ba.hi;
-
- // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
- for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
- color->vN.rgba[ii].odd.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(a);
-
-#endif
-}
-
-//
-// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"
-//
-// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/
-//
-// Lerp in two fma/mad ops:
-//
-// t * b + ((-t) * a + a)
-//
-// Note: OpenCL documents mix() as being implemented as:
-//
-// a + (b - a) * t
-//
-// But this may be a native instruction on some devices. For example,
-// on GEN9 there is an LRP "linear interpolation" function but it
-// doesn't appear to support half floats.
-//
-
-#if 1
-#define SKC_LERP(a,b,t) mad(t,b,mad(-(t),a,a))
-#else
-#define SKC_LERP(a,b,t) mix(a,b,t)
-#endif
-
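Editor's note: a quick C confirmation that the two-fma form above matches the textbook lerp, with fmaf() standing in for OpenCL's mad():

#include <math.h>
#include <stdio.h>

/* t*b + ((-t)*a + a) expressed as two fused multiply-adds, versus the
   textbook a + (b - a)*t. */
static float lerp_fma(float a, float b, float t)
{
  return fmaf(t, b, fmaf(-t, a, a));
}

static float lerp_mix(float a, float b, float t)
{
  return a + (b - a) * t;
}

int main(void)
{
  float const a = 0.25f, b = 0.75f;
  for (float t = 0.0f; t <= 1.0f; t += 0.25f)   /* 0.25f is exact in binary */
    printf("t=%.2f  fma=%f  mix=%f\n", t, lerp_fma(a, b, t), lerp_mix(a, b, t));
  return 0;
}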
-//
-// CPUs have a mock local address space so copying the gradient header
-// is probably not useful. Just read directly from global.
-//
-
-#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL
-#define SKC_RENDER_GRADIENT_SPACE __local
-#else
-#define SKC_RENDER_GRADIENT_SPACE __global
-#endif
-
-//
-// gradient is non-vertical
-//
-// removed the vertical (actually, horizontal) special case
-//
-
-static
-void
-skc_tile_color_fill_gradient_linear_nonvertical(__local union skc_subgroup_smem * SKC_RESTRICT const smem,
- __global union skc_styling_cmd const * SKC_RESTRICT const commands,
- uint * SKC_RESTRICT const cmd_next,
- union skc_tile_color * SKC_RESTRICT const color,
- skc_ttck_hi_t const ttck_hi)
-{
- //
- // Where is this tile?
- //
- // Note that the gradient is being sampled from pixel centers.
- //
- SKC_RENDER_GRADIENT_FLOAT const y =
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) I##.5f P
- (SKC_RENDER_GRADIENT_FLOAT)( SKC_RENDER_SCANLINE_VECTOR_EXPAND() ) +
- (skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE));
-
- float const x = 0.5f + (skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH);
-
- //
- // Get starting numerator and denominator
- //
- // Note: if gh[0].dx is exactly 0.0f then this is a vertical
- // gradient and can be handled by a special opcode.
- //
- // Note: the mad() ordering is slightly different than the original
- // CUDA implementation.
- //
- union skc_gradient_vector const gv = { vload4(0,&commands[*cmd_next].f32) };
-
- *cmd_next += 4;
-
- float const gv_x_dot = mad(x,gv.dx,gv.p0);
- SKC_RENDER_GRADIENT_FLOAT const gv_numer = mad(y,gv.dy,gv_x_dot);
-
- //
- // Where are columns along gradient vector?
- //
- // TODO: Note that the gv_denom isn't multiplied through.
- //
- // Please double-check this... but I recall that in certain cases
- // this wipes out some precision and results in minor but noticeable
- // gradient artifacts.
- //
- // All arguments are scalars except gv_numer so a simpler
- // evaluation might save some flops.
- //
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- color->grad[ii].distance = mad(gv.dx,(float)ii,gv_numer) * gv.denom;
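
For readers unfamiliar with the factored mad() form above: the quantity being computed is the standard linear-gradient parameter, the projection of the pixel center onto the gradient vector normalized by its squared length. The host-side packing of {p0,dx,dy,denom} is not shown in this file, so the scalar sketch below works from explicit endpoints and is only meant to expose the underlying math.

#include <stdio.h>

/* Gradient parameter of pixel P along the gradient from P0 to P1:
 * the projection of (P - P0) onto D = P1 - P0, divided by |D|^2.
 * The kernel evaluates a host-factored form of this with two mad()
 * ops per column. */
static float gradient_t(float px, float py,
                        float x0, float y0,
                        float x1, float y1)
{
  float const dx    = x1 - x0;
  float const dy    = y1 - y0;
  float const denom = 1.0f / (dx * dx + dy * dy);   /* assumes P0 != P1 */

  return ((px - x0) * dx + (py - y0) * dy) * denom;
}

int main(void)
{
  /* pixel centers of one 8-wide tile row against a gradient from (0,0) to (8,0) */
  for (int ii = 0; ii < 8; ii++)
    printf("col %d : t = %f\n", ii,
           gradient_t(0.5f + (float)ii, 0.5f, 0.0f, 0.0f, 8.0f, 0.0f));

  return 0;
}
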
-
- //
- // is gradient non-repeating, repeating or reflecting?
- //
- switch (commands[(*cmd_next)++].u32)
- {
- case SKC_STYLING_GRADIENT_TYPE_LINEAR_NON_REPEATING:
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- color->grad[ii].distance = clamp(color->grad[ii].distance,0.0f,1.0f);
- break;
-
- case SKC_STYLING_GRADIENT_TYPE_LINEAR_REPEATING:
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- color->grad[ii].distance -= floor(color->grad[ii].distance);
- break;
-
- default: // PXL_STYLING_GRADIENT_TYPE_LINEAR_REFLECTING
- //
- // OPTIMIZATION: Can this be done in fewer than ~4 ops?
- //
- // Note: OpenCL "rint()" is round-to-nearest-even integer!
- //
- // Note: the floor() "round to -inf" op is implemented in the
- // GEN op 'FRC' so probably don't use trunc() when floor will
- // suffice.
- //
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- {
- SKC_RENDER_GRADIENT_FLOAT dist_abs = fabs(color->grad[ii].distance);
- color->grad[ii].distance = fabs(dist_abs - rint(dist_abs));
- }
- }
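
The reflecting case above reduces to "distance to the nearest integer", a triangle wave of period 1 that peaks at 0.5. A tiny C sketch of the fold follows; any rescaling of the folded distance into the stop domain is presumably carried by the slope commands consumed next, so treat the output range as illustrative.

#include <math.h>
#include <stdio.h>

/* fabs(d - rint(d)) is the distance from d to its nearest integer:
 * a triangle wave of period 1 peaking at 0.5. The kernel applies it
 * to the absolute gradient distance to get the reflecting behavior. */
static float reflect_fold(float d)
{
  float const d_abs = fabsf(d);

  return fabsf(d_abs - rintf(d_abs));
}

int main(void)
{
  for (float d = -1.5f; d <= 1.5f; d += 0.25f)
    printf("d=%+5.2f -> %4.2f\n", d, reflect_fold(d));

  return 0;
}
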
-
- //
- // initialize "stoplerp" for all columns
- //
- uint const slope_count = commands[(*cmd_next)++].u32;
- uint const gd_n_v1 = commands[(*cmd_next)++].u32; // REMOVE ME
-
- {
- float const slope = commands[(*cmd_next)++].f32;
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- color->grad[ii].stoplerp = color->grad[ii].distance * slope;
- }
-
- //
- // compute stoplerp for remaining stops
- //
- for (int jj=1; jj<slope_count; jj++)
- {
- float const floor = (float)jj;
- float const slope = commands[(*cmd_next)++].f32;
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- color->grad[ii].stoplerp = mad(min(0, color->grad[ii].stoplerp - floor),slope,color->grad[ii].stoplerp);
- }
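
The stoplerp value built above packs the color-stop index in its integer part and the pair lerp fraction in its fractional part, as the decode further down shows. The branch-free per-stop slope encoding lives in the host-generated command stream and is not visible here, so the reference sketch below derives the same quantity from explicit stop positions purely for readability.

#include <stdio.h>

/* Reference evaluation of "stoplerp": the integer part selects the
 * color-stop pair and the fractional part is the lerp weight between
 * the two stops. The kernel reaches the same value branch-free from
 * precomputed per-stop slopes; this version reads explicit stop
 * positions instead. */
static float stoplerp_reference(float d, float const * stops, int stop_count)
{
  for (int jj = 0; jj < stop_count - 1; jj++)
    {
      if (d <= stops[jj + 1])
        {
          float const span = stops[jj + 1] - stops[jj];
          float const frac = (span > 0.0f) ? (d - stops[jj]) / span : 0.0f;

          return (float)jj + frac;             /* index.fraction */
        }
    }

  return (float)(stop_count - 1);              /* clamp past the last stop */
}

int main(void)
{
  float const stops[3] = { 0.0f, 0.25f, 1.0f };

  for (int i = 0; i < 8; i++)
    {
      float const d = (float)i / 8.0f;

      printf("d=%5.3f -> stoplerp=%5.3f\n", d, stoplerp_reference(d, stops, 3));
    }

  return 0;
}
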
-
- //
- // copy gradient colors to local memory
- //
- uint const gd_n = slope_count + 1;
-
-#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL
- //
- // copy entire gradient descriptor to local memory
- //
- for (uint ii=skc_subgroup_lane(); ii<gd_n*4; ii+=SKC_RENDER_SUBGROUP_SIZE)
- smem->cmds[ii].u32 = commands[*cmd_next + ii].u32;
-
- __local half const * const SKC_RESTRICT gc = smem->gc + 0;
-#else
- //
- // prefetch entire gradient header
- //
- // no noticeable impact on performance
- //
- // prefetch(&commands[*cmd_next].u32,gh_words);
- //
- __global half const * const SKC_RESTRICT gc = commands[*cmd_next].f16a2 + 0;
-#endif
-
- //
- // adjust cmd_next so that V1 structure is consumed -- FIXME
- //
- *cmd_next += SKC_GRADIENT_CMD_WORDS_V2_ADJUST(gd_n_v1,gd_n);
-
- //
- // lerp between color pair stops
- //
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- {
- //
- // Finally, we have the gradient stop index and the color stop
- // pair lerp fraction
- //
- // Note that if these are vector values then a gather operation
- // must occur -- there may be platforms (AVX-512?) that can
- // perform an explicit gather on a vector type but it's not
- // really expressible in OpenCL except implicitly with a
- // workgroup of work items.
- //
- // ***********************
- //
- // FIXME -- USE HERB'S SINGLE FMA LERP
- //
- // ***********************
- //
- SKC_RENDER_GRADIENT_STOP const gc_stop = SKC_CONVERT(SKC_RENDER_GRADIENT_STOP)(color->grad[ii].stoplerp);
- SKC_RENDER_GRADIENT_FRAC const gc_frac = SKC_CONVERT(SKC_RENDER_GRADIENT_FRAC)(color->grad[ii].stoplerp - floor(color->grad[ii].stoplerp));
-
- {
- SKC_RENDER_TILE_COLOR lo, hi;
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) { \
- SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + 0,gc); \
- lo C = cc.lo; \
- hi C = cc.hi; \
- }
-
- SKC_RENDER_SCANLINE_VECTOR_EXPAND();
-
- color->aN.rgba[ii].r = SKC_LERP(lo,hi,gc_frac);
- }
-
- //
- //
- //
- {
- SKC_RENDER_TILE_COLOR lo, hi;
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) { \
- SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n,gc); \
- lo C = cc.lo; \
- hi C = cc.hi; \
- }
-
- SKC_RENDER_SCANLINE_VECTOR_EXPAND();
-
- color->aN.rgba[ii].g = SKC_LERP(lo,hi,gc_frac);
- }
-
- //
- //
- //
- {
- SKC_RENDER_TILE_COLOR lo, hi;
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) { \
- SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*2,gc); \
- lo C = cc.lo; \
- hi C = cc.hi; \
- }
-
- SKC_RENDER_SCANLINE_VECTOR_EXPAND();
-
- color->aN.rgba[ii].b = SKC_LERP(lo,hi,gc_frac);
- }
-
- //
- //
- //
- {
- SKC_RENDER_TILE_COLOR lo, hi;
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) { \
- SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*3,gc); \
- lo C = cc.lo; \
- hi C = cc.hi; \
- }
-
- SKC_RENDER_SCANLINE_VECTOR_EXPAND();
-
- color->aN.rgba[ii].a = SKC_LERP(lo,hi,gc_frac);
- }
- }
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_blend_over(union skc_tile_color * SKC_RESTRICT const color_acc,
- union skc_tile_cover const * SKC_RESTRICT const cover_wip,
- union skc_tile_color const * SKC_RESTRICT const color_wip)
-{
- //
- // fralunco = cover.wip * acc.a
- //
- // acc.r = fralunco * wip.r + acc.r
- // acc.g = fralunco * wip.g + acc.g
- // acc.b = fralunco * wip.b + acc.b
- // acc.a = -fralunco * wip.a + acc.a
- //
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- {
- SKC_RENDER_TILE_COVER const fralunco = cover_wip->aN.c[ii] * color_acc->aN.rgba[ii].a;
-
- color_acc->aN.rgba[ii].r = mad(+fralunco,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);
- color_acc->aN.rgba[ii].g = mad(+fralunco,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);
- color_acc->aN.rgba[ii].b = mad(+fralunco,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);
- color_acc->aN.rgba[ii].a = mad(-fralunco,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);
- }
-}
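
Below is a scalar rendering of the same OVER rule with the "remaining transparency" convention spelled out for the accumulator's alpha channel; it is a readability sketch, not the kernel's vectorized path.

#include <stdio.h>

typedef struct { float r, g, b, a; } rgba_f;

/* Front-to-back OVER: acc.a holds remaining transparency
 * (1 - accumulated alpha), so it is driven toward zero. */
static void blend_over(rgba_f * acc, float cover_wip, rgba_f const * wip)
{
  float const w = cover_wip * acc->a;   /* contribution through what is still open */

  acc->r += w * wip->r;
  acc->g += w * wip->g;
  acc->b += w * wip->b;
  acc->a -= w * wip->a;                 /* shrink remaining transparency */
}

int main(void)
{
  rgba_f acc = { 0.0f, 0.0f, 0.0f, 1.0f };   /* empty tile: fully transparent */
  rgba_f red = { 1.0f, 0.0f, 0.0f, 1.0f };

  blend_over(&acc, 0.5f, &red);              /* half-covered opaque red */
  printf("acc = %g %g %g (remaining transparency %g)\n",
         acc.r, acc.g, acc.b, acc.a);

  return 0;
}
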
-
-//
-//
-//
-
-static
-void
-skc_tile_blend_plus(union skc_tile_color * SKC_RESTRICT const color_acc,
- union skc_tile_cover const * SKC_RESTRICT const cover_wip,
- union skc_tile_color const * SKC_RESTRICT const color_wip)
-{
- //
- // cover_min = min(cover.wip,a.acc)
- //
- // r.acc = cover_min * r.wip + r.acc
- // g.acc = cover_min * g.wip + g.acc
- // b.acc = cover_min * b.wip + b.acc
- // a.acc = -cover_min * a.wip + a.acc
- //
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- {
- SKC_RENDER_TILE_COVER const cover_min = fmin(cover_wip->aN.c[ii],color_acc->aN.rgba[ii].a);
-
- color_acc->aN.rgba[ii].r = mad(+cover_min,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);
- color_acc->aN.rgba[ii].g = mad(+cover_min,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);
- color_acc->aN.rgba[ii].b = mad(+cover_min,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);
- color_acc->aN.rgba[ii].a = mad(-cover_min,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);
- }
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_blend_multiply(union skc_tile_color * SKC_RESTRICT const color_acc,
- union skc_tile_cover const * SKC_RESTRICT const cover_wip,
- union skc_tile_color const * SKC_RESTRICT const color_wip)
-{
- //
- // r.acc = (cover.wip * r.wip) * r.acc
- // g.acc = (cover.wip * g.wip) * g.acc
- // b.acc = (cover.wip * b.wip) * b.acc
- // a.acc = (cover.wip * a.wip) * (1.0 - a.acc) <-- a.acc is already (1.0 - alpha)
- //
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- {
- color_acc->aN.rgba[ii].r *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].r;
- color_acc->aN.rgba[ii].g *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].g;
- color_acc->aN.rgba[ii].b *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].b;
- color_acc->aN.rgba[ii].a *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].a;
- }
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_blend_knockout(union skc_tile_cover * SKC_RESTRICT const cover_acc,
- union skc_tile_color * SKC_RESTRICT const color_acc,
- union skc_tile_cover const * SKC_RESTRICT const cover_wip,
- union skc_tile_color const * SKC_RESTRICT const color_wip)
-{
- //
- // cover.wip.contrib = (1.0 - cover.acc) * cover.wip
- // cover.acc = cover.acc + cover.wip.contrib
- //
- // r.acc = cover.wip.contrib * r.wip + r.acc
- // g.acc = cover.wip.contrib * g.wip + g.acc
- // b.acc = cover.wip.contrib * b.wip + b.acc
- // a.acc = -cover.wip.contrib * a.wip + a.acc
- //
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- {
- SKC_RENDER_TILE_COVER const contrib = (1 - cover_acc->aN.c[ii]) * cover_wip->aN.c[ii];
-
- cover_acc->aN.c[ii] += contrib;
-
- color_acc->aN.rgba[ii].r = mad(+contrib,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);
- color_acc->aN.rgba[ii].g = mad(+contrib,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);
- color_acc->aN.rgba[ii].b = mad(+contrib,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);
- color_acc->aN.rgba[ii].a = mad(-contrib,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);
- }
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_cover_msk_copy_wip(union skc_tile_cover * SKC_RESTRICT const cover_msk,
- union skc_tile_cover const * SKC_RESTRICT const cover_wip)
-{
-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- cover_msk->aN.c[ii] = cover_wip->aN.c[ii];
-
-#else
-
- // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
- for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
- cover_msk->vN.c[ii] = cover_wip->vN.c[ii];
-
-#endif
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_cover_msk_copy_acc(union skc_tile_cover * SKC_RESTRICT const cover_msk,
- union skc_tile_cover const * SKC_RESTRICT const cover_acc)
-{
-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- cover_msk->aN.c[ii] = cover_acc->aN.c[ii];
-
-#else
-
- // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
- for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
- cover_msk->vN.c[ii] = cover_acc->vN.c[ii];
-
-#endif
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_cover_accumulate(union skc_tile_cover * SKC_RESTRICT const cover_acc,
- union skc_tile_cover const * SKC_RESTRICT const cover_wip)
-{
- //
- // cover.wip.contrib = (1.0 - cover.acc) * cover.wip
- // cover.acc = cover.acc + cover.wip.contrib
- //
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- cover_acc->aN.c[ii] = mad(1 - cover_acc->aN.c[ii],cover_wip->aN.c[ii],cover_acc->aN.c[ii]);
-}
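
Worth noting: the accumulate rule above has a closed form that makes its saturation behavior obvious,

c_{\mathrm{acc}}' = c_{\mathrm{acc}} + (1 - c_{\mathrm{acc}})\,c_{\mathrm{wip}} = 1 - (1 - c_{\mathrm{acc}})(1 - c_{\mathrm{wip}}),

so the accumulated cover is one minus the product of the uncovered fractions and stays in [0,1] whenever each incoming cover does.
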
-
-//
-//
-//
-
-static
-void
-skc_tile_cover_wip_mask(union skc_tile_cover * SKC_RESTRICT const cover_wip,
- union skc_tile_cover const * SKC_RESTRICT const cover_msk)
-{
- //
- // cover.wip *= cover.msk
- //
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- cover_wip->aN.c[ii] *= cover_msk->aN.c[ii];
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_cover_wip_zero(union skc_tile_cover * SKC_RESTRICT const cover)
-{
-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 )
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- cover->aN.c[ii] = 0;
-
-#else
- //
- // GEN9 compiler underperforms on this
- //
-
- // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
- for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
- cover->vN.c[ii] = 0;
-
-#endif
-}
-
-static
-void
-skc_tile_cover_acc_zero(union skc_tile_cover * SKC_RESTRICT const cover)
-{
-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 )
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- cover->aN.c[ii] = 0;
-
-#else
- //
- // GEN9 compiler underperforms on this
- //
-
- // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
- for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
- cover->vN.c[ii] = 0;
-
-#endif
-}
-
-static
-void
-skc_tile_cover_msk_zero(union skc_tile_cover * SKC_RESTRICT const cover)
-{
-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- cover->aN.c[ii] = 0;
-
-#else
- //
- // GEN9 compiler underperforms on this
- //
-
- // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
- for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
- cover->vN.c[ii] = 0;
-
-#endif
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_cover_msk_one(union skc_tile_cover * SKC_RESTRICT const cover)
-{
-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- cover->aN.c[ii] = 1;
-
-#else
- //
- // GEN9 compiler underperforms on this
- //
-
- // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
- for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
- cover->vN.c[ii] = SKC_RENDER_TILE_COVER_VECTOR_ONE;
-
-#endif
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_cover_msk_invert(union skc_tile_cover * SKC_RESTRICT const cover)
-{
-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- cover->aN.c[ii] = 1 - cover->aN.c[ii];
-
-#else
-
- // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
- for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
- cover->vN.c[ii] = 1 - cover->vN.c[ii];
-
-#endif
-}
-
-//
-//
-//
-
-static
-void
-skc_tile_color_wip_zero(union skc_tile_color * SKC_RESTRICT const color)
-{
-#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 )
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- {
- color->aN.rgba[ii].r = 0;
- color->aN.rgba[ii].g = 0;
- color->aN.rgba[ii].b = 0;
- color->aN.rgba[ii].a = 1;
- }
-
-#else
- //
- // DISABLED ON GEN9 -- probably a compiler bug
- //
- // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
- for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
- color->vN.rgba[ii].even.even = 0;
-
- // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
- for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
- color->vN.rgba[ii].odd.even = 0;
-
- // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
- for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
- color->vN.rgba[ii].even.odd = 0;
-
- // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
- for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
- color->vN.rgba[ii].odd.odd = 1;
-#endif
-}
-
-static
-void
-skc_tile_color_acc_zero(union skc_tile_color * SKC_RESTRICT const color)
-{
-#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 )
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- {
- color->aN.rgba[ii].r = 0;
- color->aN.rgba[ii].g = 0;
- color->aN.rgba[ii].b = 0;
- color->aN.rgba[ii].a = 1;
- }
-
-#else
- //
- // DISABLED ON GEN9 -- probably a compiler bug
- //
- // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
- for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
- color->vN.rgba[ii].even.even = 0;
-
- // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
- for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
- color->vN.rgba[ii].odd.even = 0;
-
- // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
- for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
- color->vN.rgba[ii].even.odd = 0;
-
- // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
- for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
- color->vN.rgba[ii].odd.odd = 1;
-#endif
-}
-
-//
-//
-//
-
-static
-bool
-skc_tile_color_test_opacity(union skc_tile_color const * SKC_RESTRICT const color)
-{
- //
- // returns true if tile is opaque
- //
- // various hacks to test for complete tile opacity
- //
- // note that front-to-back currently has alpha at 0.0f -- this can
- // be harmonized to use a traditional alpha if we want to support
- // rendering in either direction
- //
- // hack -- ADD/MAX/OR all alphas together and test for non-zero
- //
- SKC_RENDER_TILE_COLOR t = color->aN.rgba[0].a;
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
- for (uint ii=1; ii<SKC_TILE_WIDTH; ii++)
- t += color->aN.rgba[ii].a;
-
-#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
- //
- // SIMD
- //
- return !any(t != ( 0 ));
-
-#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 )
- //
- // SIMT - scalar per lane
- //
- return !sub_group_any(t != 0);
-
-#else
- //
- // SIMT - vector per lane
- //
- return !sub_group_any(any(t != ( 0 )));
-
-#endif
-
- //
- // TODO: The alternative vector-per-lane implementation below is
- // *not* believed to be performant because the terse vector-wide
- // test is just hiding a series of comparisons and is likely worse
- // than the blind ADD/MAX/OR'ing of all alphas followed by a single
- // test.
- //
-#if 0
- //
- // SIMT - vector per lane
- //
-
- // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT-1)))
- for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
- {
- if (sub_group_any(any(color->vN.ba[ii].a != ( 0 ))))
- return false;
- }
-
- return true;
-#endif
-}
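
A scalar sketch of the early-out above: because the accumulator stores remaining transparency (0.0 means fully opaque) and, front to back, that value is assumed never to go negative, summing every column and testing once is equivalent to testing each column separately and avoids a chain of per-column compares.

#include <stdbool.h>
#include <stdio.h>

/* Single-test opacity check: sum the per-column remaining-transparency
 * values (assumed non-negative) and compare once against zero. */
static bool tile_is_opaque(float const * remaining, int width)
{
  float t = remaining[0];

  for (int ii = 1; ii < width; ii++)
    t += remaining[ii];

  return t == 0.0f;                     /* one test instead of `width` tests */
}

int main(void)
{
  float const opaque[4]   = { 0.0f, 0.0f, 0.0f,  0.0f };
  float const not_done[4] = { 0.0f, 0.0f, 0.25f, 0.0f };

  printf("%d %d\n", tile_is_opaque(opaque, 4), tile_is_opaque(not_done, 4));

  return 0;
}
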
-
-//
-//
-//
-
-static
-void
-skc_tile_background_over(__global union skc_styling_cmd const * SKC_RESTRICT const commands,
- uint * SKC_RESTRICT const cmd_next,
- union skc_tile_color * SKC_RESTRICT const color)
-{
- //
- // acc.r = acc.a * r + acc.r
- // acc.g = acc.a * g + acc.g
- // acc.b = acc.a * b + acc.b
- //
- __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0;
-
- *cmd_next += 2;
-
- SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- color->aN.rgba[ii].r = mad(color->aN.rgba[ii].a,rg.lo,color->aN.rgba[ii].r);
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- color->aN.rgba[ii].g = mad(color->aN.rgba[ii].a,rg.hi,color->aN.rgba[ii].g);
-
- SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- color->aN.rgba[ii].b = mad(color->aN.rgba[ii].a,ba.lo,color->aN.rgba[ii].b);
-}
-
-//
-//
-//
-
-// #define SKC_SURFACE_IS_BUFFER
-#ifdef SKC_SURFACE_IS_BUFFER
-
-static
-void
-skc_surface_composite_u8_rgba(__global SKC_RENDER_SURFACE_U8_RGBA * SKC_RESTRICT const surface,
- skc_uint const surface_pitch,
- union skc_tile_color const * SKC_RESTRICT const color,
- skc_ttck_hi_t const ttck_hi)
-{
- //
- // NEW MAJOR OPTIMIZATION:
- //
- // Rotating and rasterizing the original world transform by -90
- // degrees and then rendering the scene rotated by +90 degrees
- // enables the final surface composite to be performed entirely in
- // perfectly coalesced wide transactions.
- //
- // For this reason, linear access to the framebuffer is preferred.
- //
- // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv
- //
- // NOTE THIS IS TRANSPOSED BY 90 DEGREES
- //
- // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE
- // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING.
- //
- // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS
- // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS
- //
- // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL
- // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER
- //
- uint const pitch = surface_pitch / SKC_RENDER_SCANLINE_VECTOR_SIZE;
- uint const x = skc_ttck_hi_get_x(ttck_hi);
- uint const y = skc_ttck_hi_get_y(ttck_hi) ;
- uint const base = x * SKC_TILE_WIDTH * pitch + y * (SKC_TILE_HEIGHT / SKC_RENDER_SCANLINE_VECTOR_SIZE) + skc_subgroup_lane();
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- {
- SKC_RENDER_SURFACE_U8_RGBA rgba = ( 0xFF000000 );
-
- rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].r * 255);
- rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].g * 255) << 8;
- rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].b * 255) << 16;
-
- surface[base + ii * pitch] = rgba;
-
- // printf("%08v2X\n",rgba);
- }
-}
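
The addressing above is what makes the buffer store coalesced after the -90 degree rotation: for a fixed tile column, consecutive subgroup lanes land on consecutive buffer elements. The small C sketch below replays the same index arithmetic with hypothetical tile and pitch values and a scanline vector size of 1.

#include <stdio.h>

#define TILE_WIDTH  4   /* hypothetical sizes for illustration */
#define TILE_HEIGHT 8
#define VECTOR_SIZE 1   /* scanline vector size */

/* mirror of: base = x*TILE_WIDTH*pitch + y*(TILE_HEIGHT/VECTOR_SIZE) + lane,
 * element   = base + ii*pitch */
static unsigned buffer_index(unsigned surface_pitch,
                             unsigned tile_x, unsigned tile_y,
                             unsigned lane,   unsigned ii)
{
  unsigned const pitch = surface_pitch / VECTOR_SIZE;
  unsigned const base  = tile_x * TILE_WIDTH * pitch
                       + tile_y * (TILE_HEIGHT / VECTOR_SIZE)
                       + lane;

  return base + ii * pitch;
}

int main(void)
{
  /* lanes 0..7 of tile (1,2), column 0: indices differ by exactly 1 */
  for (unsigned lane = 0; lane < TILE_HEIGHT; lane++)
    printf("lane %u -> element %u\n", lane, buffer_index(64, 1, 2, lane, 0));

  return 0;
}
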
-
-#else
-
-static
-void
-skc_surface_composite_u8_rgba(__write_only image2d_t surface,
- union skc_tile_color const * SKC_RESTRICT const color,
- skc_ttck_hi_t const ttck_hi)
-{
- //
- // NEW MAJOR OPTIMIZATION:
- //
- // Rotating and rasterizing the original world transform by -90
- // degrees and then rendering the scene rotated by +90 degrees
- // enables the final surface composite to be performed entirely in
- // perfectly coalesced wide transactions.
- //
- // For this reason, linear access to the framebuffer is preferred.
- //
- // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv
- //
- // NOTE THIS IS TRANSPOSED BY 90 DEGREES
- //
- // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE
- // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING.
- //
- // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS
- // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS
- //
- // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL
- // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER
- //
-
-#if 1
- int x = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH;
- int y = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE);
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- {
-#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) { \
- SKC_RENDER_SURFACE_WRITE(surface, \
- (int2)(x,y+I), \
- color->iN.rgba[ii] A); \
- }
-
-#else
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) { \
- SKC_RENDER_SURFACE_COLOR const rgba = \
- (SKC_RENDER_SURFACE_COLOR) \
- (color->aN.rgba[ii].r C, \
- color->aN.rgba[ii].g C, \
- color->aN.rgba[ii].b C, \
- 1.0); \
- SKC_RENDER_SURFACE_WRITE(surface,(int2)(x,y+I),rgba); \
- }
-
-#endif
-
- SKC_RENDER_SCANLINE_VECTOR_EXPAND();
-
- x += 1;
- }
-#else
- int x = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE);
- int y = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH;
-
- // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
- for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
- {
-#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) { \
- SKC_RENDER_SURFACE_WRITE(surface, \
- (int2)(x+I,y+ii), \
- color->iN.rgba[ii] A); \
- }
-
-#else
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,A) { \
- SKC_RENDER_SURFACE_COLOR const rgba = \
- (SKC_RENDER_SURFACE_COLOR) \
- (color->aN.rgba[ii].r C, \
- color->aN.rgba[ii].g C, \
- color->aN.rgba[ii].b C, \
- 1.0); \
- SKC_RENDER_SURFACE_WRITE(surface,(int2)(x+I,y+ii),rgba); \
- }
-
-#endif
-
- SKC_RENDER_SCANLINE_VECTOR_EXPAND();
- }
-
-#endif
-}
-
-#endif
-
-//
-//
-//
-static
-uint const
-skc_ttck_lane(uint const ttck_idx)
-{
- return ttck_idx & SKC_RENDER_SUBGROUP_MASK;
-}
-
-//
-// RENDER KERNEL
-//
-
-__kernel
-SKC_RENDER_KERNEL_ATTRIBS
-void
-skc_kernel_render(__global union skc_layer_node const * SKC_RESTRICT const layers,
- __global struct skc_group_node const * SKC_RESTRICT const groups,
- __global union skc_styling_cmd const * SKC_RESTRICT const commands, // FIXME -- rename
-
- __global skc_ttck_t const * SKC_RESTRICT const ttck_keys, // rename: keys
- skc_uint const ttck_count, // rename: key_count
-
- __global uint const * SKC_RESTRICT const ttck_offsets, // rename: offsets
- skc_uint const tile_count, // rename: offset_count
-
- __global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent,
-#ifdef SKC_SURFACE_IS_BUFFER
- __global void * SKC_RESTRICT const surface,
-#else
- __write_only image2d_t surface,
-#endif
-#ifdef SKC_SURFACE_IS_BUFFER
- skc_uint const surface_pitch,
-#endif
- uint4 const tile_clip) // rename: clip
-{
- //
- // Each subgroup is responsible for a tile. No extra subgroups are
- // launched.
- //
- // FIXME -- this might be better implemented as a "grid stride loop"
- // if Intel GEN really has a local memory "quantum" of 4KB, which
- // would mean launching 4 subgroups per workgroup.
- //
- // Confirmed: GEN8 has 4KB SLM workgroup min while GEN9 is 1KB.
- //
-
- //
- // declare tile cover and color registers
- //
- // this used to be a neat unified struct but the Intel GEN compiler
- // wasn't cooperating and was spilling to private memory even though
- // all registers were indexed by constants
- //
- union skc_tile_color color_wip;
- union skc_tile_color color_acc;
-
- union skc_tile_cover cover_wip;
- union skc_tile_cover cover_acc;
- union skc_tile_cover cover_msk;
-
- //
- // which subgroup in the grid is this?
- //
- // TAKE NOTE: the Intel GEN compiler recognizes get_group_id(0) as a
- // uniform, but the alternative calculation used when there are
- // multiple subgroups per workgroup is not cooperating and is
- // driving spillage elsewhere.
- //
-#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 )
- skc_uint const ttck_offset_idx = get_group_id(0);
-#else
- skc_uint const ttck_offset_idx = get_group_id(0) * SKC_RENDER_WORKGROUP_SUBGROUPS + get_sub_group_id();
-#endif
-
- //
- // load the starting ttck for this offset and get a bound on the max
- // number of keys that might be loaded
- //
- // these are uniform across all subgroup lanes
- //
- skc_uint ttck_idx = ttck_offsets[ttck_offset_idx];
-
- //
- // FIXME -- SIMD/CPU version should probably load a 256-bit (4-wide)
- // vector of ttck keys
- //
-#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK
-
- skc_ttck_t ttck = ttck_keys[ttck_idx];
-
-#else
-
- uint const ttck_base = ttck_idx & ~SKC_RENDER_SUBGROUP_MASK;
- uint const ttck_lane = ttck_idx & SKC_RENDER_SUBGROUP_MASK;
- skc_ttck_t ttck_s = ttck_keys[min(ttck_base+max(get_sub_group_local_id(),ttck_lane),ttck_count-1)];
-
-#endif
-
- //
- // set up style group/layer state
- //
- struct skc_styling_group {
- union skc_group_range range;
- skc_uint depth;
- skc_uint id;
- } group;
-
- group.range.lo = 0;
- group.range.hi = SKC_UINT_MAX;
- group.depth = 0;
- group.id = SKC_UINT_MAX;
-
- //
- // start with clear tile opacity, knockout and flag bits
- //
- // uint color_acc_opacity = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32
- // uint cover_acc_knockout = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32
- //
- skc_uint flags = 0;
-
- //
- // declare and initialize accumulators
- //
-#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 )
- __local union skc_subgroup_smem smem[1];
-#else
- __local union skc_subgroup_smem smem_wg[SKC_RENDER_WORKGROUP_SUBGROUPS];
- __local union skc_subgroup_smem * SKC_RESTRICT const smem = smem_wg + get_sub_group_id();
-#endif
-
-#ifdef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK
- //
- // select the initial ttck key
- //
- skc_ttck_t ttck;
-#if 0
- ttck = sub_group_broadcast(ttck_s,ttck_lane); // SHOULD WORK BUT .4454 COMPILER IS BROKEN
-#else
- ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane); // EXPLICIT WORKAROUND
- ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane);
-#endif
-
-#endif
-
- //
- // save the first key so we know what tile we're in
- //
- skc_ttck_t ttck0 = ttck;
-
- //
- // evaluate the coarse clip as late as possible
- //
- skc_uint const ttck_hi_x = skc_ttck_hi_get_x(ttck0.hi);
-
- if ((ttck_hi_x < tile_clip.lo.x) || (ttck_hi_x >= tile_clip.hi.x))
- return;
-
- skc_uint const ttck_hi_y = skc_ttck_hi_get_y(ttck0.hi);
-
- if ((ttck_hi_y < tile_clip.lo.y) || (ttck_hi_y >= tile_clip.hi.y))
- return;
-
-#if 0
- printf("< %u, %u >\n",ttck_hi_x,ttck_hi_y);
-#endif
-
- //
- // load -> scatter -> flush
- //
- while (true)
- {
- // if scattering is disabled then just run through ttck keys
- bool const is_scatter_enabled = (flags & SKC_TILE_FLAGS_SCATTER_SKIP) == 0;
-
- // need to clear accumulators before a scatter loop
- if (is_scatter_enabled)
- {
- skc_tile_aa_zero(smem);
- }
-
- do {
- // skip scattering?
- if (is_scatter_enabled)
- {
- skc_block_id_t const xb_id = skc_ttck_lo_get_ttxb_id(ttck.lo);
-
- if (skc_ttck_lo_is_prefix(ttck.lo)) {
- skc_scatter_ttpb(ttxb_extent,smem,xb_id);
- } else {
- skc_scatter_ttsb(ttxb_extent,smem,xb_id);
- }
- }
-
- //
- // any ttck keys left?
- //
- if (++ttck_idx >= ttck_count)
- {
- flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE;
- break;
- }
-
- //
- // process next ttck key
- //
-#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK
- //
- // SIMD -- read next key
- //
- ttck = ttck_keys[ttck_idx];
-#else
- //
- // SIMT -- refresh the ttck_s?
- //
- uint const ttck_lane_next = ttck_idx & SKC_RENDER_SUBGROUP_MASK;
-
- if (ttck_lane_next == 0)
- ttck_s = ttck_keys[min(ttck_idx+get_sub_group_local_id(),ttck_count-1)];
-
- //
- // broadcast next key to entire subgroup
- //
-#if 0
- ttck = sub_group_broadcast(ttck_s,ttck_lane_next); // SHOULD WORK BUT .4454 COMPILER IS BROKEN
-#else
- ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane_next); // EXPLICIT WORKAROUND
- ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane_next);
-#endif
-#endif
- // continue scattering if on same YXL layer
- } while (skc_ttck_equal_yxl(ttck0,ttck));
-
- // finalize if no longer on same YX tile
- if (!skc_ttck_hi_equal_yx(ttck0.hi,ttck.hi))
- {
- // otherwise, unwind the tile styling and exit
- flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE;
- }
-
- //
- // given: new layer id from ttxk key
- //
- // load [layer id]{ group id, depth }
- //
- // if within current group's layer range
- //
- // if at same depth
- //
- // load and execute cover>[mask>]color>blend commands
- //
- // else if not at same depth then move deeper
- //
- // for all groups in group trail from cur depth to new depth
- // enter group, saving and initializing regs as necessary
- // increment depth and update layer range
- // load and execute cover>[mask>]color>blend commands
- //
- // else not within layer range
- //
- // exit current group, restoring regs as necessary
- // decrement depth and update layer range
- //
- //
- skc_layer_id const layer_id_new = skc_ttck_get_layer(ttck0); // FIXME -- this was ttck_hi
- union skc_layer_node const layer_node_new = layers[layer_id_new];
-
- // clear flag that controls group/layer traversal
- flags &= ~SKC_TILE_FLAGS_FLUSH_COMPLETE;
-
- do {
- bool const unwind = (flags & SKC_TILE_FLAGS_FLUSH_UNWIND) != 0;
-
- //
- // is layer a child of the current parent group?
- //
- uint cmd_next = 0;
-
- if (!unwind && (layer_node_new.parent == group.id))
- {
- // execute this layer's cmds
- cmd_next = layer_node_new.cmds;
-
- // if this is final then configure so groups get unwound, otherwise we're done
- flags |= ((flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) ? SKC_TILE_FLAGS_FLUSH_UNWIND : SKC_TILE_FLAGS_FLUSH_COMPLETE);
- }
- else if (!unwind && (layer_id_new >= group.range.lo && layer_id_new <= group.range.hi))
- {
- //
- // is layer in a child group?
- //
- union skc_group_parents const gp = groups[layer_node_new.parent].parents;
- uint const gn = gp.depth - ++group.depth;
-
- if (gn == 0)
- group.id = layer_node_new.parent;
- else
- group.id = commands[gp.base + gn - 1].parent;
-
- // update group layer range
- group.range = groups[group.id].range;
-
- // enter current group
- cmd_next = groups[group.id].cmds.enter;
- }
- else // otherwise, exit this group
- {
- // enter current group
- cmd_next = groups[group.id].cmds.leave;
-
- // decrement group depth
- if (--group.depth == 0)
- {
- flags |= SKC_TILE_FLAGS_FLUSH_COMPLETE;
- }
- else
- {
- // get path_base of current group
- uint const gnpb = groups[group.id].parents.base;
-
- // get parent of current group
- group.id = commands[gnpb].parent;
-
- // update group layer range
- group.range = groups[group.id].range;
- }
- }
-
- //
- // execute cmds
- //
- while (true)
- {
- union skc_styling_cmd const cmd = commands[cmd_next++];
-
- switch (cmd.u32 & SKC_STYLING_OPCODE_MASK_OPCODE)
- {
- case SKC_STYLING_OPCODE_NOOP:
- break;
-
- case SKC_STYLING_OPCODE_COVER_NONZERO:
- skc_tile_cover_nonzero(smem,&cover_wip,&color_wip);
- break;
-
- case SKC_STYLING_OPCODE_COVER_EVENODD:
- skc_tile_cover_evenodd(smem,&cover_wip,&color_wip);
- break;
-
- case SKC_STYLING_OPCODE_COVER_ACCUMULATE:
- skc_tile_cover_accumulate(&cover_acc,&cover_wip);
- break;
-
- case SKC_STYLING_OPCODE_COVER_MASK:
- skc_tile_cover_wip_mask(&cover_wip,&cover_msk);
- break;
-
- case SKC_STYLING_OPCODE_COVER_WIP_ZERO:
- skc_tile_cover_wip_zero(&cover_wip);
- break;
-
- case SKC_STYLING_OPCODE_COVER_ACC_ZERO:
- skc_tile_cover_acc_zero(&cover_acc);
- break;
-
- case SKC_STYLING_OPCODE_COVER_MASK_ZERO:
- skc_tile_cover_msk_zero(&cover_msk);
- break;
-
- case SKC_STYLING_OPCODE_COVER_MASK_ONE:
- skc_tile_cover_msk_one(&cover_msk);
- break;
-
- case SKC_STYLING_OPCODE_COVER_MASK_INVERT:
- skc_tile_cover_msk_invert(&cover_msk);
- break;
-
- case SKC_STYLING_OPCODE_COLOR_FILL_SOLID:
- skc_tile_color_fill_solid(commands,&cmd_next,&color_wip);
- break;
-
- case SKC_STYLING_OPCODE_COLOR_FILL_GRADIENT_LINEAR:
- //
- // FIXME -- gradients shouldn't be executing so much
- // conditional driven code at runtime since we *know*
- // the gradient style on the host can just create a
- // new styling command to exploit this.
- //
- // FIXME -- it might be time to try using the GPU's
- // sampler on a linear array of half4 vectors -- it
- // might outperform the explicit load/lerp routines.
- //
- // FIXME -- optimizing for vertical gradients (uhhh,
- // they're actually horizontal due to the -90 degree
- // view transform) is nice but is it worthwhile to
- // have this in the kernel? Easy to add it back...
- //
-#if defined( SKC_ARCH_GEN9 )
- // disable gradients due to excessive spillage -- fix later
- cmd_next += SKC_GRADIENT_CMD_WORDS_V1(commands[cmd_next+6].u32);
-#else
- skc_tile_color_fill_gradient_linear_nonvertical(smem,commands,&cmd_next,&color_wip,ttck0.hi);
-#endif
- break;
-
- case SKC_STYLING_OPCODE_COLOR_WIP_ZERO:
- skc_tile_color_wip_zero(&color_wip);
- break;
-
- case SKC_STYLING_OPCODE_COLOR_ACC_ZERO:
- skc_tile_color_acc_zero(&color_acc);
- break;
-
- case SKC_STYLING_OPCODE_BLEND_OVER:
- skc_tile_blend_over(&color_acc,&cover_wip,&color_wip);
- break;
-
- case SKC_STYLING_OPCODE_BLEND_PLUS:
- skc_tile_blend_plus(&color_acc,&cover_wip,&color_wip);
- break;
-
- case SKC_STYLING_OPCODE_BLEND_MULTIPLY:
- skc_tile_blend_multiply(&color_acc,&cover_wip,&color_wip);
- break;
-
- case SKC_STYLING_OPCODE_BLEND_KNOCKOUT:
- skc_tile_blend_knockout(&cover_acc,&color_acc,&cover_wip,&color_wip);
- break;
-
- case SKC_STYLING_OPCODE_COVER_WIP_MOVE_TO_MASK:
- // skc_tile_cover_msk_copy_wip(&cover_msk,&cover_wip);
- break;
-
- case SKC_STYLING_OPCODE_COVER_ACC_MOVE_TO_MASK:
- // skc_tile_cover_msk_copy_acc(&cover_msk,&cover_acc);
- break;
-
- case SKC_STYLING_OPCODE_BACKGROUND_OVER:
- skc_tile_background_over(commands,&cmd_next,&color_acc);
- break;
-
- case SKC_STYLING_OPCODE_SURFACE_COMPOSITE:
-#ifdef SKC_SURFACE_IS_BUFFER
- skc_surface_composite_u8_rgba(surface,surface_pitch,&color_acc,ttck0.hi);
-#else
- skc_surface_composite_u8_rgba(surface, &color_acc,ttck0.hi);
-#endif
- break;
-
- case SKC_STYLING_OPCODE_COLOR_ACC_TEST_OPACITY:
- if (skc_tile_color_test_opacity(&color_acc))
- flags |= SKC_TILE_FLAGS_SCATTER_SKIP;
- break;
-
- default:
- return; // this is an illegal opcode -- trap and die!
- }
-
- //
- // if sign bit is set then this was final command
- //
- if (cmd.s32 < 0)
- break;
- }
-
- // continue as long as tile flush isn't complete
- } while ((flags & SKC_TILE_FLAGS_FLUSH_COMPLETE) == 0);
-
- // return if was the final flush
- if (flags & SKC_TILE_FLAGS_FLUSH_FINALIZE)
- return;
-
- // update wip ttck_hi
- ttck0 = ttck;
- }
-}
-
-//
-//
-//
+/* + * Copyright 2016 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "tile.h" +#include "block.h" +#include "styling_types.h" +#include "atomic_cl.h" +#include "kernel_cl_12.h" + +// +// +// + +#define SKC_RENDER_SUBGROUP_MASK (SKC_RENDER_SUBGROUP_SIZE - 1) + +// +// +// + +#if ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 ) +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_1() +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 0 + +#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 2 ) +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_2() +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 1 + +#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 4 ) +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_4() +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 3 + +#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 8 ) +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_8() +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 7 + +#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 16) +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_16() +#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 15 +#endif + +// +// tile state flag bits +// + +typedef enum skc_tile_flags_e { + + // FLUSH + SKC_TILE_FLAGS_FLUSH_FINALIZE = 0x00000001, + SKC_TILE_FLAGS_FLUSH_UNWIND = 0x00000002, + SKC_TILE_FLAGS_FLUSH_COMPLETE = 0x00000004, + + // OPACITY + SKC_TILE_FLAGS_SCATTER_SKIP = 0x00000008, + + // + // Note: testing for opacity and skipping scattering is on its way + // to becoming a much more programmable option because sometimes we + // may be compositing/blending from back-to-front and/or be using + // group blend rules that ignore opacity. + // + // The point is that all of these decisions should be encoded in + // styling commands and, as much as possible, removed from the final + // group/layer styling traversal render loop. 
+ // + +} skc_tile_flags_e; + +// +// COVER -- assumes availability of either fp16 or fp32 +// + +union skc_tile_cover +{ + struct { + SKC_RENDER_TILE_COVER c[SKC_TILE_WIDTH]; + } aN; + +#ifdef SKC_RENDER_TILE_COVER_VECTOR + struct { + SKC_RENDER_TILE_COVER_VECTOR c[SKC_RENDER_TILE_COVER_VECTOR_COUNT]; + } vN; +#endif +}; + +// +// COLOR -- assumes availability of either fp16 or fp32 +// + +union skc_tile_color +{ + union { + struct { + SKC_RENDER_TILE_COLOR r; + SKC_RENDER_TILE_COLOR g; + SKC_RENDER_TILE_COLOR b; + SKC_RENDER_TILE_COLOR a; + } rgba[SKC_TILE_WIDTH]; + } aN; + +#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED + union { + SKC_RENDER_TILE_COLOR_INTERLEAVED rgba[SKC_TILE_WIDTH]; + } iN; +#endif + +#ifdef SKC_RENDER_TILE_COLOR_VECTOR + union { + SKC_RENDER_TILE_COLOR_VECTOR rgba[SKC_RENDER_TILE_COLOR_VECTOR_COUNT]; + } vN; +#endif + + struct { + union { + struct { + SKC_RENDER_TILE_COLOR r; + SKC_RENDER_TILE_COLOR g; + }; + SKC_RENDER_GRADIENT_FLOAT distance; + }; + union { + struct { + SKC_RENDER_TILE_COLOR b; + SKC_RENDER_TILE_COLOR a; + }; + SKC_RENDER_GRADIENT_FLOAT stoplerp; + }; + } grad[SKC_TILE_WIDTH]; +}; + +// +// SHARED MEMORY STATE +// + +#define SKC_RENDER_TILE_SMEM_WORDS ((SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT) + +#define SKC_RENDER_WIDE_AA_BYTES (SKC_RENDER_TILE_SMEM_WORDS * sizeof(int) / SKC_RENDER_SUBGROUP_SIZE) +#define SKC_RENDER_WIDE_AA_WIDTH (SKC_RENDER_WIDE_AA_BYTES / sizeof(SKC_RENDER_WIDE_AA)) + +// +// +// + +union skc_subgroup_smem +{ + // + // The tiles are stored in column-major / height-major order + // + // The final column is a guard column that is OK to write to but + // will never be read. It simplifies the TTSB scatter but could be + // predicated if SMEM is really at a premium. + // +#if ( SKC_RENDER_SUBGROUP_SIZE > 1 ) + struct { + SKC_ATOMIC_UINT area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h] + } atomic; +#endif + + struct { + int area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h] + } aN; + + struct { // assumption is that height = subgroup + SKC_RENDER_AREA_V area[SKC_TILE_WIDTH + 1][SKC_RENDER_SUBGROUP_SIZE]; + } vN; + + struct { // assumption is that height = subgroup + SKC_RENDER_WIDE_AA area[SKC_RENDER_WIDE_AA_WIDTH][SKC_RENDER_SUBGROUP_SIZE]; + } wide; + + union skc_styling_cmd cmds[(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT]; + + half gc [(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT * 2]; + +#if 0 + // + // SPILL TO GMEM + // +#if (SKC_REGS_COLOR_S > 0) || (SKC_REGS_COVER_S > 0) + struct { + +#if (SKC_REGS_COLOR_S > 0) + union skc_color_r color[SKC_REGS_COLOR_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH]; +#endif + +#if (SKC_REGS_COVER_S > 0) + union float cover[SKC_REGS_COVER_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH]; +#endif + + } regs; +#endif + // + // + // +#endif +}; + +// +// +// + +#if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) + +#define skc_subgroup_lane() 0 + +#else + +#define skc_subgroup_lane() get_sub_group_local_id() + +#endif + +// +// +// + +typedef skc_uint skc_ttsk_lo_t; +typedef skc_uint skc_ttsk_hi_t; + +typedef skc_uint skc_ttpk_lo_t; +typedef skc_uint skc_ttpk_hi_t; + +typedef skc_uint skc_ttxk_lo_t; +typedef skc_uint skc_ttxk_hi_t; + +typedef skc_uint skc_ttck_lo_t; +typedef skc_uint skc_ttck_hi_t; + +typedef skc_uint2 skc_ttck_t; + +typedef skc_int skc_ttxb_t; + +// +// TTCK (32-BIT COMPARE) v1: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 30 | 1 | 1 | 18 | 7 | 7 | +// +// +// TTCK (32-BIT COMPARE) v2: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | 
ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 30 | 1 | 1 | 15 | 9 | 8 | +// +// +// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile: +// +// 0 63 +// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | +// +----------------------+--------+--------+-------+-----+-----+ +// | 27 | 1 | 1 | 18 | 9 | 8 | +// + +static +skc_uint +skc_ttck_lo_get_ttxb_id(skc_ttck_lo_t const a) +{ + return a & SKC_TTCK_LO_MASK_ID; +} + +static +skc_layer_id +skc_ttck_get_layer(skc_ttck_t const a) +{ + // + // FIXME -- a union with a ulong and a shift down and mask is + // probably faster on some architectures + // + skc_uint const lo = (a.lo >> SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); + skc_uint const hi = (a.hi & SKC_TTCK_HI_MASK_LAYER) << SKC_TTCK_LO_BITS_LAYER; + + return lo | hi; +} + +static +skc_uint +skc_ttck_hi_get_x(skc_ttck_hi_t const a) +{ + return SKC_BFE(a,SKC_TTCK_HI_BITS_X,SKC_TTCK_HI_OFFSET_X); +} + +static +skc_uint +skc_ttck_hi_get_y(skc_ttck_hi_t const a) +{ + return a >> SKC_TTCK_HI_OFFSET_Y; +} + +static +skc_bool +skc_ttck_equal_yxl(skc_ttck_t const a, skc_ttck_t const b) +{ + skc_uint const lo = (a.lo ^ b.lo) & SKC_BITS_TO_MASK_AT(SKC_TTCK_LO_BITS_LAYER,SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); + skc_uint const hi = (a.hi ^ b.hi); + + return (lo | hi) == 0; +} + +static +skc_bool +skc_ttck_hi_equal_yx(skc_ttck_hi_t const a, skc_ttck_hi_t const b) +{ + return ((a ^ b) & SKC_TTCK_HI_MASK_YX) == 0; +} + +static +skc_bool +skc_ttck_lo_is_prefix(skc_ttck_lo_t const a) +{ + return (a & SKC_TTCK_LO_MASK_PREFIX) != 0; +} + +// +// TILE TRACE SUBPIXEL +// +// The subpixels are encoded with either absolute tile coordinates +// (32-bits) or packed in delta-encoded form form. +// +// For 32-bit subpixel packing of a 32x32 tile: +// +// A tile X is encoded as: +// +// TX : 10 : unsigned min(x0,x1) tile subpixel coordinate. +// +// SX : 6 : unsigned subpixel span from min to max x with range +// [0,32]. The original direction is not captured. Would +// be nice to capture dx but not necessary right now but +// could be in the future. <--- SPARE VALUES AVAILABLE +// +// A tile Y is encoded as: +// +// TY : 10 : unsigned min(y0,y1) tile subpixel coordinate. +// +// DY : 6 : signed subpixel delta y1-y0. The range of delta is +// [-32,32] but horizontal lines are not encoded so [1,32] +// is mapped to [0,31]. The resulting range [-32,31] fits +// in 6 bits. 
+// +// TTS: +// +// 0 31 +// | TX | SX | TY | DY | +// +-----+------+-----+------+ +// | 10 | 6 | 10 | 6 | +// + +static +SKC_RENDER_TTS_V_BITFIELD +skc_tts_get_ty_pixel_v(SKC_RENDER_TTS_V const a) +{ + // + // extract the whole pixel y coordinate + // + return SKC_BFE(a, + SKC_TTS_BITS_TY - SKC_SUBPIXEL_RESL_Y_LOG2, + SKC_TTS_OFFSET_TY + SKC_SUBPIXEL_RESL_Y_LOG2); +} + +static +SKC_RENDER_TTS_V_BITFIELD +skc_tts_get_xy_idx_v(SKC_RENDER_TTS_V const a) +{ + // + // get the linear array tile index of the pixel + // + return (((a & SKC_TTS_MASK_TX_PIXEL) + +#if (SKC_SUBPIXEL_RESL_X_LOG2 > SKC_TILE_HEIGHT_LOG2) + >> (SKC_SUBPIXEL_RESL_X_LOG2 - SKC_TILE_HEIGHT_LOG2) +#elif (SKC_SUBPIXEL_RESL_X_LOG2 < SKC_TILE_HEIGHT_LOG2) + << (SKC_TILE_HEIGHT_LOG2 - SKC_SUBPIXEL_RESL_X_LOG2) +#endif + + ) | skc_tts_get_ty_pixel_v(a)); +} + +#if 0 +static +skc_ttx_v_s32_t +skc_tts_get_dy_v(SKC_RENDER_TTS_V const a) +{ + skc_ttx_v_s32_t const dy = SKC_AS(skc_ttx_v_s32_t)a >> SKC_TTS_OFFSET_DY; + + return (dy + SKC_AS(skc_ttx_v_s32_t)(~a >> 31)); +} +#else +static +SKC_RENDER_TTS_V_BITFIELD +skc_tts_get_dy_v(SKC_RENDER_TTS_V const a) +{ + SKC_RENDER_TTS_V_BITFIELD const dy = a >> SKC_TTS_OFFSET_DY; + + return dy - (~a >> 31); +} +#endif + +static +SKC_RENDER_TTS_V_BITFIELD +skc_tts_get_tx_subpixel_v(SKC_RENDER_TTS_V const a) +{ + return a & SKC_BITS_TO_MASK(SKC_SUBPIXEL_RESL_X_LOG2); +} + +static +SKC_RENDER_TTS_V_BITFIELD +skc_tts_get_sx_v(SKC_RENDER_TTS_V const a) +{ + return SKC_BFE(a,SKC_TTS_BITS_SX,SKC_TTS_OFFSET_SX); +} + +// +// +// + +static +void +skc_tile_aa_zero(__local union skc_subgroup_smem * SKC_RESTRICT const smem) +{ + // + // SIMD / CPU + // + // & + // + // SIMT / GPU + // + // Note that atomic_init() is likely implemented as a simple + // assignment so there is no identifiable performance difference on + // current targets. + // + // If such an architecture appears in the future then we'll probably + // still want to implement this zero'ing operation as below but + // follow with an appropriate fence that occurs before any scatter + // operations. + // + // The baroque expansion below improves performance on Intel GEN by, + // presumably, achieving the 64-byte per clock SLM write as well as + // minimizing the overall number of SEND() block initializations and + // launches. + // + // Intel GENx has a documented 64 byte per cycle SLM write limit. + // So having each lane in an 8 lane subgroup zero-write 8 bytes is + // probably a safe bet (Later: benchmarking backs this up!). + // + // Note there is no reason at this time to unroll this loop. + // + for (uint ii=0; ii<SKC_RENDER_WIDE_AA_WIDTH; ii++) + smem->wide.area[ii][skc_subgroup_lane()] = ( 0 ); +} + +// +// Note this is going to be vectorizable on most architectures. +// +// The return of the key translation feature might complicate things. +// + +static +void +skc_scatter_ttpb(__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, + __local union skc_subgroup_smem * SKC_RESTRICT const smem, + skc_block_id_t const pb_id) +{ + skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane(); + +#if ( SKC_TILE_RATIO == 1 ) + + SKC_RENDER_TTP_V const ttp_v = ttxb_extent[offset]; + +#elif ( SKC_TILE_RATIO == 2 ) + + SKC_RENDER_TTP_V const ttp_v = vload2(offset,ttxb_extent); + +#else + +#error("tile ratio greater than 2 not supported") + +#endif + + // + // Note there is no need to use an atomic for this operation on the + // current group of target platforms... 
but this may change if + // atomic ops truly go through a different path. + // + // As noted above, this direct increment is probably faster and can + // always be followed by a fence. + // + // Furthermore, note that the key sorting orders all ttck keys + // before ttpk keys. + // + + // + // FIXME -- if the SMEM store is wider than bank word count then we + // might want to odd-even interleave the TTP values if the target + // device can't handle 64-bit stores + // + + // + // skipping per-key translation for now + // + smem->vN.area[0][skc_subgroup_lane()] += ttp_v << (SKC_SUBPIXEL_RESL_X_LOG2 + 1); +} + +// +// Note that skc_scatter_ttsb is *not* vectorizable unless the +// architecture supports a "scatter-add" capability. All relevant +// GPUs support atomic add on shared/local memory and thus support +// scatter-add. +// + +static +void +skc_scatter_ttsb(__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, + __local union skc_subgroup_smem * SKC_RESTRICT const smem, + skc_block_id_t const sb_id) +{ + skc_uint const offset = sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); + + SKC_RENDER_TTS_V const tts_v = ttxb_extent[offset]; + + // + // Skipping per-key translation for now + // + + // Index into tile + // + // The tiles are stored in column-major / height-major order + // + // The final column is a guard column that is OK to write to but + // will never be read. It simplifies the TTSB scatter but could be + // predicated if SMEM is really at a premium. + // + + SKC_RENDER_TTS_V_BITFIELD const xy_idx = skc_tts_get_xy_idx_v(tts_v); + +#if 0 + if (tts_v != SKC_TTS_INVALID) + printf("(%08X) = %u\n",tts_v,xy_idx); +#endif + + // + // adjust subpixel range to max y + // + // range is stored as [-32,31] and when read [0,31] is mapped to + // [1,32] because a dy of 0 is not possible. + // + // more succinctly: if dy >= 0 then ++dy + // + SKC_RENDER_TTS_V_BITFIELD const dy = skc_tts_get_dy_v(tts_v); + + // + // FIXME -- benchmark performance of setting dy to 0 if ttsv.vN is invalid? + // + + // this "min(x0) * 2 + dx" is equivalent to "x0 + x1" + SKC_RENDER_TTS_V_BITFIELD const widths = skc_tts_get_tx_subpixel_v(tts_v) * 2 + skc_tts_get_sx_v(tts_v); + + // Calculate left and right coverage contribution trapezoids + SKC_RENDER_TTS_V_BITFIELD const left = dy * widths; + SKC_RENDER_TTS_V_BITFIELD const right = (dy << (SKC_SUBPIXEL_RESL_X_LOG2 + 1)) - left; + + // + // Accumulate altitudes and areas + // + // Optimization: if the device supports an CPU/SIMD vector-add or + // GPU/SIMT scatter-add atomic int2 add operation then placing the + // ALT and AREA values side-by-side would halve the number of + // additions. + // +#if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) + // + // CPU/SIMD + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (tts_v C != SKC_TTS_INVALID) { \ + smem->aN.area[SKC_TILE_HEIGHT + xy_idx C] += left C; \ + smem->aN.area[ xy_idx C] += right C; \ + } + +#else + // + // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) \ + if (tts_v C != SKC_TTS_INVALID) { \ + SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + \ + SKC_TILE_HEIGHT + xy_idx C, \ + left C); \ + SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + xy_idx C, \ + right C); \ + } +#endif + + SKC_RENDER_TTSB_EXPAND(); +} + +// +// Note that 2048.0 can be represented exactly with fp16... fortuitous! 
+// + +#define SKC_RENDER_FILL_MAX_AREA (2u * SKC_SUBPIXEL_RESL_X * SKC_SUBPIXEL_RESL_Y) +#define SKC_RENDER_FILL_MAX_AREA_2 (2u * SKC_RENDER_FILL_MAX_AREA) +#define SKC_RENDER_FILL_EVEN_ODD_MASK (SKC_RENDER_FILL_MAX_AREA_2 - 1) +#define SKC_RENDER_FILL_MAX_AREA_RCP_F32 (SKC_RENDER_TILE_COVER)(1.0f / SKC_RENDER_FILL_MAX_AREA) + +// +// +// + +static +void +skc_tile_cover_nonzero(__local union skc_subgroup_smem * SKC_RESTRICT const smem, + union skc_tile_cover * SKC_RESTRICT const cover, + union skc_tile_color * SKC_RESTRICT const color) +{ + SKC_RENDER_ACC_COVER_INT area = 0; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2 + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + { + area += smem->vN.area[ii][skc_subgroup_lane()]; + SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area); + SKC_RENDER_TILE_COVER const nonzero = SKC_CONVERT(SKC_RENDER_TILE_COVER)(min(trapabs,SKC_RENDER_FILL_MAX_AREA)); + + cover->aN.c[ii] = nonzero * (SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA_RCP_F32); + } +} + +static +void +skc_tile_cover_evenodd(__local union skc_subgroup_smem * SKC_RESTRICT const smem, + union skc_tile_cover * SKC_RESTRICT const cover, + union skc_tile_color * SKC_RESTRICT const color) +{ + SKC_RENDER_ACC_COVER_INT area = 0; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2 + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + { + area += smem->vN.area[ii][skc_subgroup_lane()]; + SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area); + SKC_RENDER_ACC_COVER_UINT const reflect = abs(SKC_AS(SKC_RENDER_ACC_COVER_INT)((trapabs & SKC_RENDER_FILL_EVEN_ODD_MASK) - SKC_RENDER_FILL_MAX_AREA)); + + cover->aN.c[ii] = SKC_CONVERT(SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA - reflect) * (SKC_RENDER_TILE_COVER)SKC_RENDER_FILL_MAX_AREA_RCP_F32; + } +} + +// +// +// + +static +void +skc_tile_color_fill_solid(__global union skc_styling_cmd const * SKC_RESTRICT const commands, + uint * SKC_RESTRICT const cmd_next, + union skc_tile_color * SKC_RESTRICT const color) +{ + // + // rgba = solid fill + // + __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0; + + *cmd_next += 2; + +#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) + + SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + color->aN.rgba[ii].r = rg.lo; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + color->aN.rgba[ii].g = rg.hi; + + SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + color->aN.rgba[ii].b = ba.lo; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + color->aN.rgba[ii].a = ba.hi; + +#else + + SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); + SKC_RENDER_TILE_COLOR const r = rg.lo; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) + color->vN.rgba[ii].even.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(r); + + SKC_RENDER_TILE_COLOR const g = rg.hi; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) + color->vN.rgba[ii].odd.even = 
SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(g); + + SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); + SKC_RENDER_TILE_COLOR const b = ba.lo; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) + color->vN.rgba[ii].even.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(b); + + SKC_RENDER_TILE_COLOR const a = ba.hi; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) + color->vN.rgba[ii].odd.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(a); + +#endif +} + +// +// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++" +// +// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/ +// +// Lerp in two fma/mad ops: +// +// t * b + ((-t) * a + a) +// +// Note: OpenCL documents mix() as being implemented as: +// +// a + (b - a) * t +// +// But this may be a native instruction on some devices. For example, +// on GEN9 there is an LRP "linear interoplation" function but it +// doesn't appear to support half floats. +// + +#if 1 +#define SKC_LERP(a,b,t) mad(t,b,mad(-(t),a,a)) +#else +#define SKC_LERP(a,b,t) mix(a,b,t) +#endif + +// +// CPUs have a mock local address space so copying the gradient header +// is probably not useful. Just read directly from global. +// + +#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL +#define SKC_RENDER_GRADIENT_SPACE __local +#else +#define SKC_RENDER_GRADIENT_SPACE __global +#endif + +// +// gradient is non-vertical +// +// removed the vertical (actually, horizontal) special case +// + +static +void +skc_tile_color_fill_gradient_linear_nonvertical(__local union skc_subgroup_smem * SKC_RESTRICT const smem, + __global union skc_styling_cmd const * SKC_RESTRICT const commands, + uint * SKC_RESTRICT const cmd_next, + union skc_tile_color * SKC_RESTRICT const color, + skc_ttck_hi_t const ttck_hi) +{ + // + // Where is this tile? + // + // Note that the gradient is being sampled from pixel centers. + // + SKC_RENDER_GRADIENT_FLOAT const y = +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) I##.5f P + (SKC_RENDER_GRADIENT_FLOAT)( SKC_RENDER_SCANLINE_VECTOR_EXPAND() ) + + (skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE)); + + float const x = 0.5f + (skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH); + + // + // Get starting numerator and denominator + // + // Note: if gh[0].dx is exactly 0.0f then this is a vertical + // gradient and can be handled by a special opcode. + // + // Note: the mad() ordering is slightly different than the original + // CUDA implementation. + // + union skc_gradient_vector const gv = { vload4(0,&commands[*cmd_next].f32) }; + + *cmd_next += 4; + + float const gv_x_dot = mad(x,gv.dx,gv.p0); + SKC_RENDER_GRADIENT_FLOAT const gv_numer = mad(y,gv.dy,gv_x_dot); + + // + // Where are columns along gradient vector? + // + // TODO: Note that the gv_denom isn't multiplied through. + // + // Please doublecheck this... but I recall that in certain cases + // this wipes out some precision and results in minor but noticeable + // gradient artifacts. + // + // All arguments are scalars except gv_numer so a simpler + // evaluation might save some flops. + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + color->grad[ii].distance = mad(gv.dx,(float)ii,gv_numer) * gv.denom; + + // + // is gradient non-repeating, repeating or reflecting? 
+ // + switch (commands[(*cmd_next)++].u32) + { + case SKC_STYLING_GRADIENT_TYPE_LINEAR_NON_REPEATING: + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + color->grad[ii].distance = clamp(color->grad[ii].distance,0.0f,1.0f); + break; + + case SKC_STYLING_GRADIENT_TYPE_LINEAR_REPEATING: + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + color->grad[ii].distance -= floor(color->grad[ii].distance); + break; + + default: // PXL_STYLING_GRADIENT_TYPE_LINEAR_REFLECTING + // + // OPTIMIZATION: Can this be done in fewer than ~4 ops? + // + // Note: OpenCL "rint()" is round-to-nearest-even integer! + // + // Note: the floor() "round to -inf" op is implemented in the + // GEN op 'FRC' so probably don't use trunc() when floor will + // suffice. + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + { + SKC_RENDER_GRADIENT_FLOAT dist_abs = fabs(color->grad[ii].distance); + color->grad[ii].distance = fabs(dist_abs - rint(dist_abs)); + } + } + + // + // initialize "stoplerp" for all columns + // + uint const slope_count = commands[(*cmd_next)++].u32; + uint const gd_n_v1 = commands[(*cmd_next)++].u32; // REMOVE ME + + { + float const slope = commands[(*cmd_next)++].f32; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + color->grad[ii].stoplerp = color->grad[ii].distance * slope; + } + + // + // compute stoplerp for remaining stops + // + for (int jj=1; jj<slope_count; jj++) + { + float const floor = (float)jj; + float const slope = commands[(*cmd_next)++].f32; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + color->grad[ii].stoplerp = mad(min(0, color->grad[ii].stoplerp - floor),slope,color->grad[ii].stoplerp); + } + + // + // copy gradient colors to local memory + // + uint const gd_n = slope_count + 1; + +#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL + // + // copy entire gradient descriptor to local memory + // + for (uint ii=skc_subgroup_lane(); ii<gd_n*4; ii+=SKC_RENDER_SUBGROUP_SIZE) + smem->cmds[ii].u32 = commands[*cmd_next + ii].u32; + + __local half const * const SKC_RESTRICT gc = smem->gc + 0; +#else + // + // prefetch entire gradient header + // + // no noticeable impact on performance + // + // prefetch(&commands[*cmd_next].u32,gh_words); + // + __global half const * const SKC_RESTRICT gc = commands[*cmd_next].f16a2 + 0; +#endif + + // + // adjust cmd_next so that V1 structure is consumed -- FIXME + // + *cmd_next += SKC_GRADIENT_CMD_WORDS_V2_ADJUST(gd_n_v1,gd_n); + + // + // lerp between color pair stops + // + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + { + // + // Finally, we have the gradient stop index and the color stop + // pair lerp fraction + // + // Note that if these are vector values then a gather operation + // must occur -- there may be platforms (AVX-512?) that can + // perform an explicit gather on a vector type but it's not + // really expressible in OpenCL except implicitly with a + // workgroup of work items. 
+ // + // *********************** + // + // FIXME -- USE HERB'S SINGLE FMA LERP + // + // *********************** + // + SKC_RENDER_GRADIENT_STOP const gc_stop = SKC_CONVERT(SKC_RENDER_GRADIENT_STOP)(color->grad[ii].stoplerp); + SKC_RENDER_GRADIENT_FRAC const gc_frac = SKC_CONVERT(SKC_RENDER_GRADIENT_FRAC)(color->grad[ii].stoplerp - floor(color->grad[ii].stoplerp)); + + { + SKC_RENDER_TILE_COLOR lo, hi; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) { \ + SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + 0,gc); \ + lo C = cc.lo; \ + hi C = cc.hi; \ + } + + SKC_RENDER_SCANLINE_VECTOR_EXPAND(); + + color->aN.rgba[ii].r = SKC_LERP(lo,hi,gc_frac); + } + + // + // + // + { + SKC_RENDER_TILE_COLOR lo, hi; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) { \ + SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n,gc); \ + lo C = cc.lo; \ + hi C = cc.hi; \ + } + + SKC_RENDER_SCANLINE_VECTOR_EXPAND(); + + color->aN.rgba[ii].g = SKC_LERP(lo,hi,gc_frac); + } + + // + // + // + { + SKC_RENDER_TILE_COLOR lo, hi; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) { \ + SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*2,gc); \ + lo C = cc.lo; \ + hi C = cc.hi; \ + } + + SKC_RENDER_SCANLINE_VECTOR_EXPAND(); + + color->aN.rgba[ii].b = SKC_LERP(lo,hi,gc_frac); + } + + // + // + // + { + SKC_RENDER_TILE_COLOR lo, hi; + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) { \ + SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*3,gc); \ + lo C = cc.lo; \ + hi C = cc.hi; \ + } + + SKC_RENDER_SCANLINE_VECTOR_EXPAND(); + + color->aN.rgba[ii].a = SKC_LERP(lo,hi,gc_frac); + } + } +} + +// +// +// + +static +void +skc_tile_blend_over(union skc_tile_color * SKC_RESTRICT const color_acc, + union skc_tile_cover const * SKC_RESTRICT const cover_wip, + union skc_tile_color const * SKC_RESTRICT const color_wip) +{ + // + // fralunco = cover.wip * acc.a + // + // acc.r = fralunco * wip.r + acc.r + // acc.g = fralunco * wip.g + acc.g + // acc.b = fralunco * wip.b + acc.b + // acc.a = -fralunco * wip.a + acc.a + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + { + SKC_RENDER_TILE_COVER const fralunco = cover_wip->aN.c[ii] * color_acc->aN.rgba[ii].a; + + color_acc->aN.rgba[ii].r = mad(+fralunco,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r); + color_acc->aN.rgba[ii].g = mad(+fralunco,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g); + color_acc->aN.rgba[ii].b = mad(+fralunco,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b); + color_acc->aN.rgba[ii].a = mad(-fralunco,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a); + } +} + +// +// +// + +static +void +skc_tile_blend_plus(union skc_tile_color * SKC_RESTRICT const color_acc, + union skc_tile_cover const * SKC_RESTRICT const cover_wip, + union skc_tile_color const * SKC_RESTRICT const color_wip) +{ + // + // cover_min = min(cover.wip,a.acc) + // + // r.acc = cover_min * r.wip + r.acc + // g.acc = cover_min * g.wip + g.acc + // b.acc = cover_min * b.wip + b.acc + // a.acc = -cover_min * a.wip + a.acc + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + { + SKC_RENDER_TILE_COVER const cover_min = fmin(cover_wip->aN.c[ii],color_acc->aN.rgba[ii].a); + + color_acc->aN.rgba[ii].r = mad(+cover_min,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r); + color_acc->aN.rgba[ii].g = 
mad(+cover_min,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g); + color_acc->aN.rgba[ii].b = mad(+cover_min,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b); + color_acc->aN.rgba[ii].a = mad(-cover_min,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a); + } +} + +// +// +// + +static +void +skc_tile_blend_multiply(union skc_tile_color * SKC_RESTRICT const color_acc, + union skc_tile_cover const * SKC_RESTRICT const cover_wip, + union skc_tile_color const * SKC_RESTRICT const color_wip) +{ + // + // r.acc = (cover.wip * r.wip) * r.acc + // g.acc = (cover.wip * g.wip) * g.acc + // b.acc = (cover.wip * b.wip) * b.acc + // a.acc = (cover.wip * a.wip) * (1.0 - a.acc) <-- a.acc is already (1.0 - alpha) + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + { + color_acc->aN.rgba[ii].r *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].r; + color_acc->aN.rgba[ii].g *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].g; + color_acc->aN.rgba[ii].b *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].b; + color_acc->aN.rgba[ii].a *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].a; + } +} + +// +// +// + +static +void +skc_tile_blend_knockout(union skc_tile_cover * SKC_RESTRICT const cover_acc, + union skc_tile_color * SKC_RESTRICT const color_acc, + union skc_tile_cover const * SKC_RESTRICT const cover_wip, + union skc_tile_color const * SKC_RESTRICT const color_wip) +{ + // + // cover.wip.contrib = (1.0 - cover.acc) * cover.wip + // cover.acc = cover.acc + cover.wip.contrib + // + // r.acc = cover.wip.contrib * r.wip + r.acc + // g.acc = cover.wip.contrib * g.wip + g.acc + // b.acc = cover.wip.contrib * b.wip + b.acc + // a.acc = -cover.wip.contrib * a.wip * a.acc + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + { + SKC_RENDER_TILE_COVER const contrib = (1 - cover_acc->aN.c[ii]) * cover_wip->aN.c[ii]; + + cover_acc->aN.c[ii] += contrib; + + color_acc->aN.rgba[ii].r = mad(+contrib,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r); + color_acc->aN.rgba[ii].g = mad(+contrib,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g); + color_acc->aN.rgba[ii].b = mad(+contrib,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b); + color_acc->aN.rgba[ii].a = mad(-contrib,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a); + } +} + +// +// +// + +static +void +skc_tile_cover_msk_copy_wip(union skc_tile_cover * SKC_RESTRICT const cover_msk, + union skc_tile_cover const * SKC_RESTRICT const cover_wip) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + cover_msk->aN.c[ii] = cover_wip->aN.c[ii]; + +#else + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) + for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++) + cover_msk->vN.c[ii] = cover_wip->vN.c[ii]; + +#endif +} + +// +// +// + +static +void +skc_tile_cover_msk_copy_acc(union skc_tile_cover * SKC_RESTRICT const cover_msk, + union skc_tile_cover const * SKC_RESTRICT const cover_acc) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + cover_msk->aN.c[ii] = cover_acc->aN.c[ii]; + +#else + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNTN))) + for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++) + cover_msk->vN.c[ii] = cover_acc->vN.c[ii]; + 
+#endif +} + +// +// +// + +static +void +skc_tile_cover_accumulate(union skc_tile_cover * SKC_RESTRICT const cover_acc, + union skc_tile_cover const * SKC_RESTRICT const cover_wip) +{ + // + // cover.wip.contrib = (1.0 - cover.acc) * cover.wip + // cover.acc = cover.acc + cover.wip.contrib + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + cover_acc->aN.c[ii] = mad(1 - cover_acc->aN.c[ii],cover_wip->aN.c[ii],cover_acc->aN.c[ii]); +} + +// +// +// + +static +void +skc_tile_cover_wip_mask(union skc_tile_cover * SKC_RESTRICT const cover_wip, + union skc_tile_cover const * SKC_RESTRICT const cover_msk) +{ + // + // cover.wip *= cover.msk + // + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + cover_wip->aN.c[ii] *= cover_msk->aN.c[ii]; +} + +// +// +// + +static +void +skc_tile_cover_wip_zero(union skc_tile_cover * SKC_RESTRICT const cover) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + cover->aN.c[ii] = 0; + +#else + // + // GEN9 compiler underperforms on this + // + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) + for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++) + cover->vN.c[ii] = 0; + +#endif +} + +static +void +skc_tile_cover_acc_zero(union skc_tile_cover * SKC_RESTRICT const cover) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + cover->aN.c[ii] = 0; + +#else + // + // GEN9 compiler underperforms on this + // + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) + for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++) + cover->vN.c[ii] = 0; + +#endif +} + +static +void +skc_tile_cover_msk_zero(union skc_tile_cover * SKC_RESTRICT const cover) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + cover->aN.c[ii] = 0; + +#else + // + // GEN9 compiler underperforms on this + // + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) + for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++) + cover->vN.c[ii] = 0; + +#endif +} + +// +// +// + +static +void +skc_tile_cover_msk_one(union skc_tile_cover * SKC_RESTRICT const cover) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + cover->aN.c[ii] = 1; + +#else + // + // GEN9 compiler underperforms on this + // + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) + for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++) + cover->vN.c[ii] = SKC_RENDER_TILE_COVER_VECTOR_ONE; + +#endif +} + +// +// +// + +static +void +skc_tile_cover_msk_invert(union skc_tile_cover * SKC_RESTRICT const cover) +{ +#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + cover->aN.c[ii] = 1 - cover->aN.c[ii]; + +#else + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) + for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++) + cover->vN.c[ii] = 1 - 
cover->vN.c[ii]; + +#endif +} + +// +// +// + +static +void +skc_tile_color_wip_zero(union skc_tile_color * SKC_RESTRICT const color) +{ +#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + { + color->aN.rgba[ii].r = 0; + color->aN.rgba[ii].g = 0; + color->aN.rgba[ii].b = 0; + color->aN.rgba[ii].a = 1; + } + +#else + // + // DISABLED ON GEN9 -- probably a compiler bug + // + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) + color->vN.rgba[ii].even.even = 0; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) + color->vN.rgba[ii].odd.even = 0; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) + color->vN.rgba[ii].even.odd = 0; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) + color->vN.rgba[ii].odd.odd = 1; +#endif +} + +static +void +skc_tile_color_acc_zero(union skc_tile_color * SKC_RESTRICT const color) +{ +#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 ) + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + { + color->aN.rgba[ii].r = 0; + color->aN.rgba[ii].g = 0; + color->aN.rgba[ii].b = 0; + color->aN.rgba[ii].a = 1; + } + +#else + // + // DISABLED ON GEN9 -- probably a compiler bug + // + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) + color->vN.rgba[ii].even.even = 0; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) + color->vN.rgba[ii].odd.even = 0; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) + color->vN.rgba[ii].even.odd = 0; + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) + for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) + color->vN.rgba[ii].odd.odd = 1; +#endif +} + +// +// +// + +static +bool +skc_tile_color_test_opacity(union skc_tile_color const * SKC_RESTRICT const color) +{ + // + // returns true if tile is opaque + // + // various hacks to test for complete tile opacity + // + // note that front-to-back currently has alpha at 0.0f -- this can + // be harmonized to use a traditional alpha if we want to support + // rendering in either direction + // + // hack -- ADD/MAX/OR all alphas together and test for non-zero + // + SKC_RENDER_TILE_COLOR t = color->aN.rgba[0].a; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) + for (uint ii=1; ii<SKC_TILE_WIDTH; ii++) + t += color->aN.rgba[ii].a; + +#if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) + // + // SIMD + // + return !any(t != ( 0 )); + +#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 ) + // + // SIMT - scalar per lane + // + return !sub_group_any(t != 0); + +#else + // + // SIMT - vector per lane + // + return !sub_group_any(any(t != ( 0 ))); + +#endif + + // + // TODO: The alternative vector-per-lane implementation below is + // *not* believed to be performant because the terse vector-wide + // test is just hiding a series of comparisons and is likely 
worse + // than the blind ADD/MAX/OR'ing of all alphas followed by a single + // test. + // +#if 0 + // + // SIMT - vector per lane + // + + // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT-1))) + for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) + { + if (sub_group_any(any(color->vN.ba[ii].a != ( 0 )))) + return false; + } + + return true; +#endif +} + +// +// +// + +static +void +skc_tile_background_over(__global union skc_styling_cmd const * SKC_RESTRICT const commands, + uint * SKC_RESTRICT const cmd_next, + union skc_tile_color * SKC_RESTRICT const color) +{ + // + // acc.r = acc.a * r + acc.r + // acc.g = acc.a * g + acc.g + // acc.b = acc.a * b + acc.b + // + __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0; + + *cmd_next += 2; + + SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + color->aN.rgba[ii].r = mad(color->aN.rgba[ii].a,rg.lo,color->aN.rgba[ii].r); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + color->aN.rgba[ii].g = mad(color->aN.rgba[ii].a,rg.hi,color->aN.rgba[ii].g); + + SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + color->aN.rgba[ii].b = mad(color->aN.rgba[ii].a,ba.lo,color->aN.rgba[ii].b); +} + +// +// +// + +// #define SKC_SURFACE_IS_BUFFER +#ifdef SKC_SURFACE_IS_BUFFER + +static +void +skc_surface_composite_u8_rgba(__global SKC_RENDER_SURFACE_U8_RGBA * SKC_RESTRICT const surface, + skc_uint const surface_pitch, + union skc_tile_color const * SKC_RESTRICT const color, + skc_ttck_hi_t const ttck_hi) +{ + // + // NEW MAJOR OPTIMIZATION: + // + // Rotating and rasterizing the original world transform by -90 + // degrees and then rendering the scene scene by +90 degrees enables + // all the final surface composite to be perfomed in perfectly + // coalesced wide transactions. + // + // For this reason, linear access to the framebuffer is preferred. + // + // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv + // + // NOTE THIS IS TRANSPOSED BY 90 DEGREES + // + // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE + // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING. 
+ // + // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS + // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS + // + // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL + // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER + // + uint const pitch = surface_pitch / SKC_RENDER_SCANLINE_VECTOR_SIZE; + uint const x = skc_ttck_hi_get_x(ttck_hi); + uint const y = skc_ttck_hi_get_y(ttck_hi) ; + uint const base = x * SKC_TILE_WIDTH * pitch + y * (SKC_TILE_HEIGHT / SKC_RENDER_SCANLINE_VECTOR_SIZE) + skc_subgroup_lane(); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + { + SKC_RENDER_SURFACE_U8_RGBA rgba = ( 0xFF000000 ); + + rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].r * 255); + rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].g * 255) << 8; + rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].b * 255) << 16; + + surface[base + ii * pitch] = rgba; + + // printf("%08v2X\n",rgba); + } +} + +#else + +static +void +skc_surface_composite_u8_rgba(__write_only image2d_t surface, + union skc_tile_color const * SKC_RESTRICT const color, + skc_ttck_hi_t const ttck_hi) +{ + // + // NEW MAJOR OPTIMIZATION: + // + // Rotating and rasterizing the original world transform by -90 + // degrees and then rendering the scene scene by +90 degrees enables + // all the final surface composite to be perfomed in perfectly + // coalesced wide transactions. + // + // For this reason, linear access to the framebuffer is preferred. + // + // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv + // + // NOTE THIS IS TRANSPOSED BY 90 DEGREES + // + // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE + // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING. 
+ // + // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS + // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS + // + // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL + // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER + // + +#if 1 + int x = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH; + int y = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE); + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + { +#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) { \ + SKC_RENDER_SURFACE_WRITE(surface, \ + (int2)(x,y+I), \ + color->iN.rgba[ii] A); \ + } + +#else + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) { \ + SKC_RENDER_SURFACE_COLOR const rgba = \ + (SKC_RENDER_SURFACE_COLOR) \ + (color->aN.rgba[ii].r C, \ + color->aN.rgba[ii].g C, \ + color->aN.rgba[ii].b C, \ + 1.0); \ + SKC_RENDER_SURFACE_WRITE(surface,(int2)(x,y+I),rgba); \ + } + +#endif + + SKC_RENDER_SCANLINE_VECTOR_EXPAND(); + + x += 1; + } +#else + int x = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE); + int y = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH; + + // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) + for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) + { +#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) { \ + SKC_RENDER_SURFACE_WRITE(surface, \ + (int2)(x+I,y+ii), \ + color->iN.rgba[ii] A); \ + } + +#else + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,A) { \ + SKC_RENDER_SURFACE_COLOR const rgba = \ + (SKC_RENDER_SURFACE_COLOR) \ + (color->aN.rgba[ii].r C, \ + color->aN.rgba[ii].g C, \ + color->aN.rgba[ii].b C, \ + 1.0); \ + SKC_RENDER_SURFACE_WRITE(surface,(int2)(x+I,y+ii),rgba); \ + } + +#endif + + SKC_RENDER_SCANLINE_VECTOR_EXPAND(); + } + +#endif +} + +#endif + +// +// +// +static +uint const +skc_ttck_lane(uint const ttck_idx) +{ + return ttck_idx & SKC_RENDER_SUBGROUP_MASK; +} + +// +// RENDER KERNEL +// + +__kernel +SKC_RENDER_KERNEL_ATTRIBS +void +skc_kernel_render(__global union skc_layer_node const * SKC_RESTRICT const layers, + __global struct skc_group_node const * SKC_RESTRICT const groups, + __global union skc_styling_cmd const * SKC_RESTRICT const commands, // FIXME -- rename + + __global skc_ttck_t const * SKC_RESTRICT const ttck_keys, // rename: keys + skc_uint const ttck_count, // rename: key_count + + __global uint const * SKC_RESTRICT const ttck_offsets, // rename: offsets + skc_uint const tile_count, // rename: offset_count + + __global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, +#ifdef SKC_SURFACE_IS_BUFFER + __global void * SKC_RESTRICT const surface, +#else + __write_only image2d_t surface, +#endif +#ifdef SKC_SURFACE_IS_BUFFER + skc_uint const surface_pitch, +#endif + uint4 const tile_clip) // rename: clip +{ + // + // Each subgroup is responsible for a tile. No extra subgroups are + // launched. + // + // FIXME -- might be better implemented as a "grid stride loop" if + // Intel GEN really has a local memory "quantum" of 4KB which means + // we would need to launch 4 subgroups per workgroup. + // + // Confirmed: GEN8 has 4KB SLM workgroup min while GEN9 is 1KB. 
+ // + + // + // declare tile cover and color registers + // + // this used to be a neat unified struct but the Intel GEN compiler + // wasn't cooperating and spilling to private memory even though all + // registers were indexed by constants + // + union skc_tile_color color_wip; + union skc_tile_color color_acc; + + union skc_tile_cover cover_wip; + union skc_tile_cover cover_acc; + union skc_tile_cover cover_msk; + + // + // which subgroup in the grid is this? + // + // TAKE NOTE: the Intel GEN compiler is recognizing get_group_id(0) + // as a uniform but the alternative calculation used when there are + // multiple subgroups per workgroup is not cooperating and + // driving spillage elsewhere. + // +#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 ) + skc_uint const ttck_offset_idx = get_group_id(0); +#else + skc_uint const ttck_offset_idx = get_group_id(0) * SKC_RENDER_WORKGROUP_SUBGROUPS + get_sub_group_id(); +#endif + + // + // load the starting ttck for this offset and get a bound on the max + // number of keys that might be loaded + // + // these are uniform across all subgroup lanes + // + skc_uint ttck_idx = ttck_offsets[ttck_offset_idx]; + + // + // FIXME -- SIMD/CPU version should probaby load a 256-bit (4-wide) + // vector of ttck keys + // +#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK + + skc_ttck_t ttck = ttck_keys[ttck_idx]; + +#else + + uint const ttck_base = ttck_idx & ~SKC_RENDER_SUBGROUP_MASK; + uint const ttck_lane = ttck_idx & SKC_RENDER_SUBGROUP_MASK; + skc_ttck_t ttck_s = ttck_keys[min(ttck_base+max(get_sub_group_local_id(),ttck_lane),ttck_count-1)] + +#endif + + // + // set up style group/layer state + // + struct skc_styling_group { + union skc_group_range range; + skc_uint depth; + skc_uint id; + } group; + + group.range.lo = 0; + group.range.hi = SKC_UINT_MAX; + group.depth = 0; + group.id = SKC_UINT_MAX; + + // + // start with clear tile opacity, knockout and flag bits + // + // uint color_acc_opacity = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32 + // uint cover_acc_knockout = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32 + // + skc_uint flags = 0; + + // + // declare and initialize accumulators + // +#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 ) + __local union skc_subgroup_smem smem[1]; +#else + __local union skc_subgroup_smem smem_wg[SKC_RENDER_WORKGROUP_SUBGROUPS]; + __local union skc_subgroup_smem * SKC_RESTRICT const smem = smem_wg + get_sub_group_id(); +#endif + +#ifdef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK + // + // select the initial ttck key + // + skc_ttck_t ttck; +#if 0 + ttck = sub_group_broadcast(ttck_s,ttck_lane); // SHOULD WORK BUT .4454 COMPILER IS BROKEN +#else + ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane); // EXPLICIT WORKAROUND + ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane); +#endif + +#endif + + // + // save the first key so we know what tile we're in + // + skc_ttck_t ttck0 = ttck; + + // + // evaluate the coarse clip as late as possible + // + skc_uint const ttck_hi_x = skc_ttck_hi_get_x(ttck0.hi); + + if ((ttck_hi_x < tile_clip.lo.x) || (ttck_hi_x >= tile_clip.hi.x)) + return; + + skc_uint const ttck_hi_y = skc_ttck_hi_get_y(ttck0.hi); + + if ((ttck_hi_y < tile_clip.lo.y) || (ttck_hi_y >= tile_clip.hi.y)) + return; + +#if 0 + printf("< %u, %u >\n",ttck_hi_x,ttck_hi_y); +#endif + + // + // load -> scatter -> flush + // + while (true) + { + // if scattering is disabled then just run through ttck keys + bool const is_scatter_enabled = (flags & SKC_TILE_FLAGS_SCATTER_SKIP) == 0; + + // need to clear 
accumulators before a scatter loop + if (is_scatter_enabled) + { + skc_tile_aa_zero(smem); + } + + do { + // skip scattering? + if (is_scatter_enabled) + { + skc_block_id_t const xb_id = skc_ttck_lo_get_ttxb_id(ttck.lo); + + if (skc_ttck_lo_is_prefix(ttck.lo)) { + skc_scatter_ttpb(ttxb_extent,smem,xb_id); + } else { + skc_scatter_ttsb(ttxb_extent,smem,xb_id); + } + } + + // + // any ttck keys left? + // + if (++ttck_idx >= ttck_count) + { + flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE; + break; + } + + // + // process next ttck key + // +#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK + // + // SIMD -- read next key + // + ttck = ttck_keys[ttck_idx]; +#else + // + // SIMT -- refresh the ttck_s? + // + uint const ttck_lane_next = ttck_idx & SKC_RENDER_SUBGROUP_MASK; + + if (ttck_lane_next == 0) + ttck_s = ttck_keys[min(ttck_idx+get_sub_group_local_id(),ttck_count-1)]; + + // + // broadcast next key to entire subgroup + // +#if 0 + ttck = sub_group_broadcast(ttck_s,ttck_lane_next); // SHOULD WORK BUT .4454 COMPILER IS BROKEN +#else + ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane_next); // EXPLICIT WORKAROUND + ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane_next); +#endif +#endif + // continue scattering if on same YXL layer + } while (skc_ttck_equal_yxl(ttck0,ttck)); + + // finalize if no longer on same YX tile + if (!skc_ttck_hi_equal_yx(ttck0.hi,ttck.hi)) + { + // otherwise, unwind the tile styling and exit + flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE; + } + + // + // given: new layer id from ttxk key + // + // load [layer id]{ group id, depth } + // + // if within current group's layer range + // + // if at same depth + // + // load and execute cover>[mask>]color>blend commands + // + // else if not at same depth then move deeper + // + // for all groups in group trail from cur depth to new depth + // enter group, saving and initializing regs as necessary + // increment depth and update layer range + // load and execute cover>[mask>]color>blend commands + // + // else not within layer range + // + // exit current group, restoring regs as necessary + // decrement depth and update layer range + // + // + skc_layer_id const layer_id_new = skc_ttck_get_layer(ttck0); // FIXME -- this was ttck_hi + union skc_layer_node const layer_node_new = layers[layer_id_new]; + + // clear flag that controls group/layer traversal + flags &= ~SKC_TILE_FLAGS_FLUSH_COMPLETE; + + do { + bool const unwind = (flags & SKC_TILE_FLAGS_FLUSH_UNWIND) != 0; + + // + // is layer a child of the current parent group? + // + uint cmd_next = 0; + + if (!unwind && (layer_node_new.parent == group.id)) + { + // execute this layer's cmds + cmd_next = layer_node_new.cmds; + + // if this is final then configure so groups get unwound, otherwise we're done + flags |= ((flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) ? SKC_TILE_FLAGS_FLUSH_UNWIND : SKC_TILE_FLAGS_FLUSH_COMPLETE); + } + else if (!unwind && (layer_id_new >= group.range.lo && layer_id_new <= group.range.hi)) + { + // + // is layer in a child group? 
+ // + union skc_group_parents const gp = groups[layer_node_new.parent].parents; + uint const gn = gp.depth - ++group.depth; + + if (gn == 0) + group.id = layer_node_new.parent; + else + group.id = commands[gp.base + gn - 1].parent; + + // update group layer range + group.range = groups[group.id].range; + + // enter current group + cmd_next = groups[group.id].cmds.enter; + } + else // otherwise, exit this group + { + // enter current group + cmd_next = groups[group.id].cmds.leave; + + // decrement group depth + if (--group.depth == 0) + { + flags |= SKC_TILE_FLAGS_FLUSH_COMPLETE; + } + else + { + // get path_base of current group + uint const gnpb = groups[group.id].parents.base; + + // get parent of current group + group.id = commands[gnpb].parent; + + // update group layer range + group.range = groups[group.id].range; + } + } + + // + // execute cmds + // + while (true) + { + union skc_styling_cmd const cmd = commands[cmd_next++]; + + switch (cmd.u32 & SKC_STYLING_OPCODE_MASK_OPCODE) + { + case SKC_STYLING_OPCODE_NOOP: + break; + + case SKC_STYLING_OPCODE_COVER_NONZERO: + skc_tile_cover_nonzero(smem,&cover_wip,&color_wip); + break; + + case SKC_STYLING_OPCODE_COVER_EVENODD: + skc_tile_cover_evenodd(smem,&cover_wip,&color_wip); + break; + + case SKC_STYLING_OPCODE_COVER_ACCUMULATE: + skc_tile_cover_accumulate(&cover_acc,&cover_wip); + break; + + case SKC_STYLING_OPCODE_COVER_MASK: + skc_tile_cover_wip_mask(&cover_wip,&cover_msk); + break; + + case SKC_STYLING_OPCODE_COVER_WIP_ZERO: + skc_tile_cover_wip_zero(&cover_wip); + break; + + case SKC_STYLING_OPCODE_COVER_ACC_ZERO: + skc_tile_cover_acc_zero(&cover_acc); + break; + + case SKC_STYLING_OPCODE_COVER_MASK_ZERO: + skc_tile_cover_msk_zero(&cover_msk); + break; + + case SKC_STYLING_OPCODE_COVER_MASK_ONE: + skc_tile_cover_msk_one(&cover_msk); + break; + + case SKC_STYLING_OPCODE_COVER_MASK_INVERT: + skc_tile_cover_msk_invert(&cover_msk); + break; + + case SKC_STYLING_OPCODE_COLOR_FILL_SOLID: + skc_tile_color_fill_solid(commands,&cmd_next,&color_wip); + break; + + case SKC_STYLING_OPCODE_COLOR_FILL_GRADIENT_LINEAR: + // + // FIXME -- gradients shouldn't be executing so much + // conditional driven code at runtime since we *know* + // the gradient style on the host can just create a + // new styling command to exploit this. + // + // FIXME -- it might be time to try using the GPU's + // sampler on a linear array of half4 vectors -- it + // might outperform the explicit load/lerp routines. + // + // FIXME -- optimizing for vertical gradients (uhhh, + // they're actually horizontal due to the -90 degree + // view transform) is nice but is it worthwhile to + // have this in the kernel? Easy to add it back... 
+ // +#if defined( SKC_ARCH_GEN9 ) + // disable gradients due to exessive spillage -- fix later + cmd_next += SKC_GRADIENT_CMD_WORDS_V1(commands[cmd_next+6].u32); +#else + skc_tile_color_fill_gradient_linear_nonvertical(smem,commands,&cmd_next,&color_wip,ttck0.hi); +#endif + break; + + case SKC_STYLING_OPCODE_COLOR_WIP_ZERO: + skc_tile_color_wip_zero(&color_wip); + break; + + case SKC_STYLING_OPCODE_COLOR_ACC_ZERO: + skc_tile_color_acc_zero(&color_acc); + break; + + case SKC_STYLING_OPCODE_BLEND_OVER: + skc_tile_blend_over(&color_acc,&cover_wip,&color_wip); + break; + + case SKC_STYLING_OPCODE_BLEND_PLUS: + skc_tile_blend_plus(&color_acc,&cover_wip,&color_wip); + break; + + case SKC_STYLING_OPCODE_BLEND_MULTIPLY: + skc_tile_blend_multiply(&color_acc,&cover_wip,&color_wip); + break; + + case SKC_STYLING_OPCODE_BLEND_KNOCKOUT: + skc_tile_blend_knockout(&cover_acc,&color_acc,&cover_wip,&color_wip); + break; + + case SKC_STYLING_OPCODE_COVER_WIP_MOVE_TO_MASK: + // skc_tile_cover_msk_copy_wip(&cover_msk,&cover_wip); + break; + + case SKC_STYLING_OPCODE_COVER_ACC_MOVE_TO_MASK: + // skc_tile_cover_msk_copy_acc(&cover_msk,&cover_acc); + break; + + case SKC_STYLING_OPCODE_BACKGROUND_OVER: + skc_tile_background_over(commands,&cmd_next,&color_acc); + break; + + case SKC_STYLING_OPCODE_SURFACE_COMPOSITE: +#ifdef SKC_SURFACE_IS_BUFFER + skc_surface_composite_u8_rgba(surface,surface_pitch,&color_acc,ttck0.hi); +#else + skc_surface_composite_u8_rgba(surface, &color_acc,ttck0.hi); +#endif + break; + + case SKC_STYLING_OPCODE_COLOR_ACC_TEST_OPACITY: + if (skc_tile_color_test_opacity(&color_acc)) + flags |= SKC_TILE_FLAGS_SCATTER_SKIP; + break; + + default: + return; // this is an illegal opcode -- trap and die! + } + + // + // if sign bit is set then this was final command + // + if (cmd.s32 < 0) + break; + } + + // continue as long as tile flush isn't complete + } while ((flags & SKC_TILE_FLAGS_FLUSH_COMPLETE) == 0); + + // return if was the final flush + if (flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) + return; + + // update wip ttck_hi + ttck0 = ttck; + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl b/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl index 378d51d8d7..7f48978782 100644 --- a/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl +++ b/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl @@ -1,130 +1,130 @@ -/*
- * Copyright 2018 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE
-// LAYOUT OF THE TTCK KEY. IF THE TTCK KEY IS ALTERED THEN THIS
-// KERNEL WILL NEED TO BE UPDATED
-//
-
-#include "tile.h"
-#include "atomic_cl.h"
-#include "device_cl_12.h"
-
-//
-//
-//
-
-#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
-#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
-
-//
-//
-//
-
-#define SKC_YX_NEQ(row,prev) \
- (((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0)
-
-//
-//
-//
-
-__kernel
-__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
-void
-skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
- __global uint * SKC_RESTRICT const indices,
- __global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics)
-{
- uint const global_id = get_global_id(0);
- uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
- uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK);
- uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
-
- //
- // LOAD ALL THE ROWS
- //
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
-
- HS_SLAB_ROWS();
-
- //
- // LOAD LAST REGISTER FROM COLUMN TO LEFT
- //
- uint diffs = 0;
- uint2 r0 = r1;
-
- if (gmem_base > 0) {
- // if this is the first key in any slab but the first then it
- // broadcast loads the last key in previous slab
- r0.hi = as_uint2(vout[gmem_base - 1]).hi;
- } else if (get_sub_group_local_id() == 0) {
- // if this is the first lane in the first slab
- diffs = 1;
- }
-
- // now shuffle in the last key from the column to the left
- r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
-
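The shuffle above is the crux of the neighbor comparison: after it, every lane holds the key belonging to the lane on its left, with the carried-in value from the previous slab sliding in at lane 0. A minimal sketch of the same idiom, assuming the cl_intel_subgroups extension and a packed 1-D NDRange; the names mark_boundaries, keys, is_boundary and carry_in are illustrative, not taken from the diff:

#pragma OPENCL EXTENSION cl_intel_subgroups : enable

// Each lane compares its own key against the key held by the lane to
// its left.  The "prev" argument of the shuffle supplies the element
// that slides in at lane 0 -- here a caller-provided carry from the
// previous slab.
__kernel void
mark_boundaries(__global uint const * const keys,
                __global uint       * const is_boundary,
                uint                  const carry_in)
{
  uint const idx  = get_global_id(0);
  uint const key  = keys[idx];

  // lanes 1..N-1 receive the key of the lane to their left,
  // lane 0 receives carry_in
  uint const left = intel_sub_group_shuffle_up(carry_in,key,1);

  is_boundary[idx] = (key != left) ? 1 : 0;
}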
- //
- // FIND ALL DIFFERENCES IN SLAB
- //
- uint valid = 0;
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- valid |= ((r##row != SKC_ULONG_MAX) << prev);
-
- HS_SLAB_ROWS();
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- diffs |= (SKC_YX_NEQ(row,prev) << prev);
-
- HS_SLAB_ROWS();
-
- //
- // SUM UP THE DIFFERENCES
- //
- uint const valid_diffs = valid & diffs;
- uint const count = popcount(valid_diffs);
- uint const inclusive = sub_group_scan_inclusive_add(count);
- uint const exclusive = inclusive - count;
-
- //
- // RESERVE SPACE IN THE INDICES ARRAY
- //
- uint next = 0;
-
- if (get_sub_group_local_id() == HS_LANES_PER_WARP-1)
- next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset
-
- // distribute base across subgroup
- next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1);
-
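This reserve-then-distribute idiom -- one atomic per subgroup issued by the last lane, then a broadcast of the returned base -- generalizes to any subgroup-level compaction. A minimal sketch under the same assumption (cl_khr_subgroups available); compact_nonzero, in, out and out_count are made-up names:

#pragma OPENCL EXTENSION cl_khr_subgroups : enable

// Keep the non-zero elements of "in", packed densely into "out".
// Exactly one atomic is issued per subgroup: the last lane reserves
// room for all of the subgroup's survivors, the base index is
// broadcast, and every lane stores at its own exclusive offset.
__kernel void
compact_nonzero(__global uint const    * const in,
                __global volatile uint * const out_count,
                __global uint          * const out)
{
  uint const v         = in[get_global_id(0)];
  uint const keep      = (v != 0) ? 1 : 0;
  uint const inclusive = sub_group_scan_inclusive_add(keep);
  uint const exclusive = inclusive - keep;
  uint const last      = get_sub_group_size() - 1;

  uint base = 0;

  if (get_sub_group_local_id() == last)
    base = atomic_add(out_count,inclusive);   // inclusive == subgroup total on the last lane

  base = sub_group_broadcast(base,last);

  if (keep)
    out[base + exclusive] = v;
}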
- //
- // STORE THE INDICES
- //
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- if (valid_diffs & (1 << prev)) \
- indices[next++] = lane_idx + prev;
-
- HS_SLAB_ROWS();
-
- //
- // TRANSPOSE THE SLAB AND STORE IT
- //
- HS_TRANSPOSE_SLAB();
-}
-
-//
-//
-//
+/* + * Copyright 2018 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE +// LAYOUT OF THE TTCK KEY. IF THE TTCK KEY IS ALTERED THEN THIS +// KERNEL WILL NEED TO BE UPDATED +// + +#include "tile.h" +#include "atomic_cl.h" +#include "kernel_cl_12.h" + +// +// +// + +#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP) +#define HS_LANE_MASK (HS_LANES_PER_WARP - 1) + +// +// +// + +#define SKC_YX_NEQ(row,prev) \ + (((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0) + +// +// +// + +__kernel +__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP))) +void +skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const vout, + __global uint * SKC_RESTRICT const indices, + __global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics) +{ + uint const global_id = get_global_id(0); + uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB; + uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK); + uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE; + + // + // LOAD ALL THE ROWS + // +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP]; + + HS_SLAB_ROWS(); + + // + // LOAD LAST REGISTER FROM COLUMN TO LEFT + // + uint diffs = 0; + uint2 r0 = r1; + + if (gmem_base > 0) { + // if this is the first key in any slab but the first then it + // broadcast loads the last key in previous slab + r0.hi = as_uint2(vout[gmem_base - 1]).hi; + } else if (get_sub_group_local_id() == 0) { + // if this is the first lane in the first slab + diffs = 1; + } + + // now shuffle in the last key from the column to the left + r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1); + + // + // FIND ALL DIFFERENCES IN SLAB + // + uint valid = 0; + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + valid |= ((r##row != SKC_ULONG_MAX) << prev); + + HS_SLAB_ROWS(); + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + diffs |= (SKC_YX_NEQ(row,prev) << prev); + + HS_SLAB_ROWS(); + + // + // SUM UP THE DIFFERENCES + // + uint const valid_diffs = valid & diffs; + uint const count = popcount(valid_diffs); + uint const inclusive = sub_group_scan_inclusive_add(count); + uint const exclusive = inclusive - count; + + // + // RESERVE SPACE IN THE INDICES ARRAY + // + uint next = 0; + + if (get_sub_group_local_id() == HS_LANES_PER_WARP-1) + next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset + + // distribute base across subgroup + next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1); + + // + // STORE THE INDICES + // +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + if (valid_diffs & (1 << prev)) \ + indices[next++] = lane_idx + prev; + + HS_SLAB_ROWS(); + + // + // TRANSPOSE THE SLAB AND STORE IT + // + HS_TRANSPOSE_SLAB(); +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl b/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl index e9accde307..9db82d5f98 100644 --- a/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl +++ b/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl @@ -1,394 +1,394 @@ -/*
- * Copyright 2018 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-// NOTE THAT THE SEGMENT TTRK KERNEL IS ENTIRELY DEPENDENT ON THE
-// LAYOUT OF THE TTRK KEY. IF THE TTRK KEY IS ALTERED THEN THIS
-// KERNEL WILL NEED TO BE UPDATED
-//
-
-#include "tile.h"
-#include "raster_builder_cl_12.h" // need meta_in structure
-#include "device_cl_12.h"
-
-//
-//
-//
-
-#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
-#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
-
-//
-// THE BEST TYPE TO ZERO SMEM
-//
-
-#define SKC_ZERO_TYPE ulong
-#define SKC_ZERO_WORDS 2
-
-//
-// THE ORDER OF COMPONENTS IS:
-//
-// 0: blocks
-// 1: offset
-// 2: pk
-// 3: rk
-//
-
-#if (HS_KEYS_PER_SLAB < 256)
-
-#define SKC_META_TYPE uint
-#define SKC_META_WORDS 1
-
-#define SKC_COMPONENT_TYPE uchar
-
-#else
-
-#define SKC_META_TYPE uint2
-#define SKC_META_WORDS 2
-
-#define SKC_COMPONENT_TYPE ushort
-
-#endif
-
-//
-//
-//
-
-#if ( SKC_TTRK_HI_BITS_COHORT <= 8)
-#define SKC_COHORT_TYPE uchar
-#else
-#define SKC_COHORT_TYPE ushort
-#endif
-
-//
-//
-//
-
-#define SKC_COHORT_ID(row) \
- as_uint2(r##row).hi >> SKC_TTRK_HI_OFFSET_COHORT
-
-//
-// FIXME -- THIS WILL BREAK IF EITHER THE YX BITS OR OFFSET ARE CHANGED
-//
-
-#define SKC_IS_BLOCK(row) \
- ((as_uint2(r##row).lo & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
-
-#define SKC_YX(row,prev) \
- (as_uint2(r##row).hi ^ as_uint2(r##prev).hi)
-
-#define SKC_IS_PK(row,prev) \
- ((uint)(SKC_YX(row,prev) - 1) < SKC_TTRK_HI_MASK_X)
-
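The comparison inside SKC_IS_PK leans on unsigned wraparound: for an unsigned v, (v - 1) < N is true exactly when 1 <= v <= N, because v == 0 wraps to UINT_MAX and fails the test. Applied to the XOR of two hi words it reads as "the keys differ, but only in their low X bits" -- i.e. same cohort and Y, different X (my reading of the key layout). A tiny standalone illustration of the trick:

// Equivalent to (v >= 1) && (v <= n) for unsigned v, in one compare.
bool in_range_1_to_n(uint const v, uint const n)
{
  return (v - 1u) < n;
}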
-//
-// COHORT SIZE IS ALWAYS A POWER-OF-TWO
-// SUBGROUP SIZE IS ALWAYS A POWER-OF-TWO
-//
-// COHORT SIZE >= SUBGROUP SIZE
-//
-
-#define SKC_COHORT_SIZE (1<<SKC_TTRK_HI_BITS_COHORT)
-
-#define SKC_ZERO_RATIO (SKC_ZERO_WORDS / SKC_META_WORDS)
-#define SKC_META_ZERO_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_ZERO_TYPE))
-#define SKC_META_ZERO_REM (SKC_META_ZERO_COUNT & SKC_BITS_TO_MASK(HS_LANES_PER_WARP_LOG2))
-
-#define SKC_META_COMPONENTS 4
-#define SKC_META_COMPONENT_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_COMPONENT_TYPE))
-
-//
-//
-//
-
-__kernel
-__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
-void
-skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
- __global uint * SKC_RESTRICT const metas)
-{
- __local union
- {
- SKC_META_TYPE volatile m[SKC_COHORT_SIZE];
- SKC_ZERO_TYPE z[SKC_META_ZERO_COUNT];
- SKC_COMPONENT_TYPE c[SKC_META_COMPONENT_COUNT];
- } shared;
-
- uint const global_id = get_global_id(0);
- uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
- uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK);
- uint const gmem_off = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
-
- //
- // LOAD ALL THE ROWS
- //
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
-
- HS_SLAB_ROWS();
-
- //
- // LOAD LAST REGISTER FROM COLUMN TO LEFT
- //
- uint diffs = 0;
- uint2 r0 = 0;
-
- if (gmem_base > 0) {
- // if this is the first key in any slab but the first then it
- // broadcast loads the last key in previous slab
- r0.hi = as_uint2(vout[gmem_base - 1]).hi;
- } else {
- // otherwise broadcast the first key in the first slab
- r0.hi = sub_group_broadcast(as_uint2(r1).hi,0);
- // and mark it as an implicit diff
- if (get_sub_group_local_id() == 0)
- diffs = 1;
- }
-
- // now shuffle in the last key from the column to the left
- r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
-
- // shift away y/x
- SKC_COHORT_TYPE const c0 = r0.hi >> SKC_TTRK_HI_OFFSET_COHORT;
-
- //
- // EXTRACT ALL COHORT IDS EARLY...
- //
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- SKC_COHORT_TYPE c##row = SKC_COHORT_ID(row);
-
- HS_SLAB_ROWS();
-
- //
- // DEBUG
- //
-#if 0
- if (gmem_base == HS_KEYS_PER_SLAB * 7)
- {
- if (get_sub_group_local_id() == 0)
- printf("\n%llX ",as_ulong(r0));
- else
- printf("%llX ",as_ulong(r0));
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- if (get_sub_group_local_id() == 0) \
- printf("\n%llX ",r##row); \
- else \
- printf("%llX ",r##row);
-
- HS_SLAB_ROWS();
- }
-#endif
-
- //
- // CAPTURE ALL CONDITIONS WE CARE ABOUT
- //
- // Diffs must be captured before cohorts
- //
- uint valid = 0;
- uint blocks = 0;
- uint pks = 0;
- SKC_COHORT_TYPE c_max = 0;
-
- //
- // FIXME -- IT'S UNCLEAR IF SHIFTING THE CONDITION CODE VS. AN
- // EXPLICIT PREDICATE WILL GENERATE THE SAME CODE
- //
-#if 0
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- diffs |= ((c##row != c##prev) << prev);
-
- HS_SLAB_ROWS();
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- blocks |= (SKC_IS_BLOCK(row) << prev);
-
- HS_SLAB_ROWS();
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
-  pks    |= (SKC_IS_PK(row,prev) << prev);
-
- HS_SLAB_ROWS();
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- valid |= ((r##row != SKC_ULONG_MAX) << prev);
-
- HS_SLAB_ROWS();
-
-#else
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- if (c##row != c##prev) \
- diffs |= 1<<prev;
-
- HS_SLAB_ROWS();
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- if (SKC_IS_BLOCK(row)) \
- blocks |= 1<<prev;
-
- HS_SLAB_ROWS();
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- if (SKC_IS_PK(row,prev)) \
- pks |= 1<<prev;
-
- HS_SLAB_ROWS();
-
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- if (r##row != SKC_ULONG_MAX) { \
- valid |= 1<<prev; \
- c_max = max(c_max,c##row); \
- }
-
- HS_SLAB_ROWS();
-
-#endif
-
- //
- // TRANSPOSE THE SLAB AND STORE IT
- //
- HS_TRANSPOSE_SLAB();
-
- // the min cohort is the first key in the slab
- uint const c_min = sub_group_broadcast(c1,0);
-
- // the max cohort is the max across all lanes
- c_max = sub_group_reduce_max(c_max);
-
-#if 0 // REMOVE ME LATER
- if (get_sub_group_local_id() == 0)
- printf("%3u : ( %3u , %3u )\n",
- get_global_id(0)>>HS_LANES_PER_WARP_LOG2,c_min,c_max);
-#endif
-
- //
- // ZERO SMEM
- //
- // zero only the meta info for the cohort ids found in this slab
- //
-#if (SKC_ZERO_WORDS >= SKC_META_WORDS)
- uint zz = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id();
- uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO;
-
- for (; zz<=zz_max; zz+=HS_LANES_PER_WARP)
- shared.z[zz] = 0;
-#else
- // ERROR -- it's highly unlikely that the zero type is smaller than
- // the meta type
-#error("Unsupported right now...")
-#endif
-
- //
- // ACCUMULATE AND STORE META INFO
- //
- uint const valid_blocks = valid & blocks;
- uint const valid_pks = valid & pks & ~diffs;
- SKC_META_TYPE meta = ( 0 );
-
-#define SKC_META_LOCAL_ADD(meta) \
- atomic_add(shared.m+HS_REG_LAST(c),meta);
-
-#define SKC_META_LOCAL_STORE(meta,prev) \
- shared.m[c##prev] = meta;
-
- // note this is purposefully off by +1
-#define SKC_META_RESET(meta,curr) \
- meta = ((gmem_off + curr) << 8);
-
-#if 0
-
- // FIXME -- this can be tweaked to shift directly
-#define SKC_META_ADD(meta,prev,blocks,pks,rks) \
- meta += ((((blocks >> prev) & 1) ) | \
- (((pks >> prev) & 1) << 16) | \
- (((rks >> prev) & 1) << 24));
-
-#else
-
-#define SKC_META_ADD(meta,prev,blocks,pks,rks) \
- if (blocks & (1<<prev)) \
- meta += 1; \
- if (pks & (1<<prev)) \
- meta += 1<<16; \
- if (rks & (1<<prev)) \
- meta += 1<<24;
-
-#endif
-
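For the uint-sized SKC_META_TYPE case, the accumulator built by SKC_META_RESET/SKC_META_ADD packs four byte-wide fields into one word, matching the component order listed at the top of the file: block count in bits 0-7, the slab-local offset in bits 8-15, pk count in bits 16-23 and rk count in bits 24-31. A small sketch of that packing, assuming none of the counts overflows a byte; the helper names are hypothetical:

// Pack/unpack helpers mirroring the byte layout described above
// (blocks | offset<<8 | pks<<16 | rks<<24).
uint meta_pack(uint const blocks, uint const offset, uint const pks, uint const rks)
{
  return ( blocks & 0xFF)        |
         ((offset & 0xFF) <<  8) |
         ((pks    & 0xFF) << 16) |
         ((rks    & 0xFF) << 24);
}

uint meta_blocks(uint const meta) { return  meta        & 0xFF; }
uint meta_offset(uint const meta) { return (meta >>  8) & 0xFF; }
uint meta_pks   (uint const meta) { return (meta >> 16) & 0xFF; }
uint meta_rks   (uint const meta) { return (meta >> 24) & 0xFF; }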
-#undef HS_SLAB_ROW
-#define HS_SLAB_ROW(row,prev) \
- if (diffs & (1<<prev)) { \
- SKC_META_LOCAL_STORE(meta,prev); \
- SKC_META_RESET(meta,row); \
- } \
- SKC_META_ADD(meta,prev, \
- valid_blocks, \
- valid_pks, \
- valid);
-
- HS_SLAB_ROWS();
-
- //
- // ATOMICALLY ADD THE CARRIED OUT METAS
- //
-#if 0 // BUG
- if ((valid & (1<<(HS_KEYS_PER_LANE-1))) && (meta != 0))
- SKC_META_LOCAL_ADD(meta);
-#else
- if (meta != 0)
- SKC_META_LOCAL_ADD(meta);
-#endif
-
- //
- // NOW ATOMICALLY ADD ALL METAS TO THE GLOBAL META TABLE
- //
-
- // convert the slab offset to an extent offset
- bool const is_offset = (get_sub_group_local_id() & 3) == 1;
- uint const adjust = is_offset ? gmem_base - 1 : 0;
-
- //
- // only process the meta components found in this slab
- //
- uint const cc_min = c_min * SKC_META_COMPONENTS;
- uint const cc_max = c_max * SKC_META_COMPONENTS + SKC_META_COMPONENTS - 1;
- uint cc = (cc_min & ~HS_LANE_MASK) + get_sub_group_local_id();
-
- if ((cc >= cc_min) && (cc <= cc_max))
- {
- uint const c = shared.c[cc];
-
- if (c != 0)
- atomic_add(metas+cc,c+adjust);
- }
-
- cc += HS_LANES_PER_WARP;
-
- for (; cc<=cc_max; cc+=HS_LANES_PER_WARP)
- {
- uint const c = shared.c[cc];
-
- if (c != 0)
- atomic_add(metas+cc,c+adjust);
- }
-}
-
-//
-//
-//
+/* + * Copyright 2018 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// NOTE THAT THE SEGMENT TTRK KERNEL IS ENTIRELY DEPENDENT ON THE +// LAYOUT OF THE TTRK KEY. IF THE TTRK KEY IS ALTERED THEN THIS +// KERNEL WILL NEED TO BE UPDATED +// + +#include "tile.h" +#include "raster_builder_cl_12.h" // need meta_in structure +#include "kernel_cl_12.h" + +// +// +// + +#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP) +#define HS_LANE_MASK (HS_LANES_PER_WARP - 1) + +// +// THE BEST TYPE TO ZERO SMEM +// + +#define SKC_ZERO_TYPE ulong +#define SKC_ZERO_WORDS 2 + +// +// THE ORDER OF COMPONENTS IS: +// +// 0: blocks +// 1: offset +// 2: pk +// 3: rk +// + +#if (HS_KEYS_PER_SLAB < 256) + +#define SKC_META_TYPE uint +#define SKC_META_WORDS 1 + +#define SKC_COMPONENT_TYPE uchar + +#else + +#define SKC_META_TYPE uint2 +#define SKC_META_WORDS 2 + +#define SKC_COMPONENT_TYPE ushort + +#endif + +// +// +// + +#if ( SKC_TTRK_HI_BITS_COHORT <= 8) +#define SKC_COHORT_TYPE uchar +#else +#define SKC_COHORT_TYPE ushort +#endif + +// +// +// + +#define SKC_COHORT_ID(row) \ + as_uint2(r##row).hi >> SKC_TTRK_HI_OFFSET_COHORT + +// +// FIXME -- THIS WILL BREAK IF EITHER THE YX BITS OR OFFSET ARE CHANGED +// + +#define SKC_IS_BLOCK(row) \ + ((as_uint2(r##row).lo & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0) + +#define SKC_YX(row,prev) \ + (as_uint2(r##row).hi ^ as_uint2(r##prev).hi) + +#define SKC_IS_PK(row,prev) \ + ((uint)(SKC_YX(row,prev) - 1) < SKC_TTRK_HI_MASK_X) + +// +// COHORT SIZE IS ALWAYS A POWER-OF-TWO +// SUBGROUP SIZE IS ALWAYS A POWER-OF-TWO +// +// COHORT SIZE >= SUBGROUP SIZE +// + +#define SKC_COHORT_SIZE (1<<SKC_TTRK_HI_BITS_COHORT) + +#define SKC_ZERO_RATIO (SKC_ZERO_WORDS / SKC_META_WORDS) +#define SKC_META_ZERO_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_ZERO_TYPE)) +#define SKC_META_ZERO_REM (SKC_META_ZERO_COUNT & SKC_BITS_TO_MASK(HS_LANES_PER_WARP_LOG2)) + +#define SKC_META_COMPONENTS 4 +#define SKC_META_COMPONENT_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_COMPONENT_TYPE)) + +// +// +// + +__kernel +__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP))) +void +skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout, + __global uint * SKC_RESTRICT const metas) +{ + __local union + { + SKC_META_TYPE volatile m[SKC_COHORT_SIZE]; + SKC_ZERO_TYPE z[SKC_META_ZERO_COUNT]; + SKC_COMPONENT_TYPE c[SKC_META_COMPONENT_COUNT]; + } shared; + + uint const global_id = get_global_id(0); + uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB; + uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK); + uint const gmem_off = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE; + + // + // LOAD ALL THE ROWS + // +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP]; + + HS_SLAB_ROWS(); + + // + // LOAD LAST REGISTER FROM COLUMN TO LEFT + // + uint diffs = 0; + uint2 r0 = 0; + + if (gmem_base > 0) { + // if this is the first key in any slab but the first then it + // broadcast loads the last key in previous slab + r0.hi = as_uint2(vout[gmem_base - 1]).hi; + } else { + // otherwise broadcast the first key in the first slab + r0.hi = sub_group_broadcast(as_uint2(r1).hi,0); + // and mark it as an implicit diff + if (get_sub_group_local_id() == 0) + diffs = 1; + } + + // now shuffle in the last key from the column to the left + r0.hi = 
intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1); + + // shift away y/x + SKC_COHORT_TYPE const c0 = r0.hi >> SKC_TTRK_HI_OFFSET_COHORT; + + // + // EXTRACT ALL COHORT IDS EARLY... + // +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + SKC_COHORT_TYPE c##row = SKC_COHORT_ID(row); + + HS_SLAB_ROWS(); + + // + // DEBUG + // +#if 0 + if (gmem_base == HS_KEYS_PER_SLAB * 7) + { + if (get_sub_group_local_id() == 0) + printf("\n%llX ",as_ulong(r0)); + else + printf("%llX ",as_ulong(r0)); +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + if (get_sub_group_local_id() == 0) \ + printf("\n%llX ",r##row); \ + else \ + printf("%llX ",r##row); + + HS_SLAB_ROWS(); + } +#endif + + // + // CAPTURE ALL CONDITIONS WE CARE ABOUT + // + // Diffs must be captured before cohorts + // + uint valid = 0; + uint blocks = 0; + uint pks = 0; + SKC_COHORT_TYPE c_max = 0; + + // + // FIXME -- IT'S UNCLEAR IF SHIFTING THE CONDITION CODE VS. AN + // EXPLICIT PREDICATE WILL GENERATE THE SAME CODE + // +#if 0 + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + diffs |= ((c##row != c##prev) << prev); + + HS_SLAB_ROWS(); + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + blocks |= (SKC_IS_BLOCK(row) << prev); + + HS_SLAB_ROWS(); + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + pks |= SKC_IS_PK(row,prev) << prev); + + HS_SLAB_ROWS(); + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + valid |= ((r##row != SKC_ULONG_MAX) << prev); + + HS_SLAB_ROWS(); + +#else + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + if (c##row != c##prev) \ + diffs |= 1<<prev; + + HS_SLAB_ROWS(); + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + if (SKC_IS_BLOCK(row)) \ + blocks |= 1<<prev; + + HS_SLAB_ROWS(); + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + if (SKC_IS_PK(row,prev)) \ + pks |= 1<<prev; + + HS_SLAB_ROWS(); + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + if (r##row != SKC_ULONG_MAX) { \ + valid |= 1<<prev; \ + c_max = max(c_max,c##row); \ + } + + HS_SLAB_ROWS(); + +#endif + + // + // TRANSPOSE THE SLAB AND STORE IT + // + HS_TRANSPOSE_SLAB(); + + // the min cohort is the first key in the slab + uint const c_min = sub_group_broadcast(c1,0); + + // the max cohort is the max across all lanes + c_max = sub_group_reduce_max(c_max); + +#if 0 // REMOVE ME LATER + if (get_sub_group_local_id() == 0) + printf("%3u : ( %3u , %3u )\n", + get_global_id(0)>>HS_LANES_PER_WARP_LOG2,c_min,c_max); +#endif + + // + // ZERO SMEM + // + // zero only the meta info for the cohort ids found in this slab + // +#if (SKC_ZERO_WORDS >= SKC_META_WORDS) + uint zz = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id(); + uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO; + + for (; zz<=zz_max; zz+=HS_LANES_PER_WARP) + shared.z[zz] = 0; +#else + // ERROR -- it's highly unlikely that the zero type is smaller than + // the meta type +#error("Unsupported right now...") +#endif + + // + // ACCUMULATE AND STORE META INFO + // + uint const valid_blocks = valid & blocks; + uint const valid_pks = valid & pks & ~diffs; + SKC_META_TYPE meta = ( 0 ); + +#define SKC_META_LOCAL_ADD(meta) \ + atomic_add(shared.m+HS_REG_LAST(c),meta); + +#define SKC_META_LOCAL_STORE(meta,prev) \ + shared.m[c##prev] = meta; + + // note this is purposefully off by +1 +#define SKC_META_RESET(meta,curr) \ + meta = ((gmem_off + curr) << 8); + +#if 0 + + // FIXME -- this can be tweaked to shift directly +#define SKC_META_ADD(meta,prev,blocks,pks,rks) \ + meta += ((((blocks >> prev) & 1) ) | \ + 
(((pks >> prev) & 1) << 16) | \ + (((rks >> prev) & 1) << 24)); + +#else + +#define SKC_META_ADD(meta,prev,blocks,pks,rks) \ + if (blocks & (1<<prev)) \ + meta += 1; \ + if (pks & (1<<prev)) \ + meta += 1<<16; \ + if (rks & (1<<prev)) \ + meta += 1<<24; + +#endif + +#undef HS_SLAB_ROW +#define HS_SLAB_ROW(row,prev) \ + if (diffs & (1<<prev)) { \ + SKC_META_LOCAL_STORE(meta,prev); \ + SKC_META_RESET(meta,row); \ + } \ + SKC_META_ADD(meta,prev, \ + valid_blocks, \ + valid_pks, \ + valid); + + HS_SLAB_ROWS(); + + // + // ATOMICALLY ADD THE CARRIED OUT METAS + // +#if 0 // BUG + if ((valid & (1<<(HS_KEYS_PER_LANE-1))) && (meta != 0)) + SKC_META_LOCAL_ADD(meta); +#else + if (meta != 0) + SKC_META_LOCAL_ADD(meta); +#endif + + // + // NOW ATOMICALLY ADD ALL METAS TO THE GLOBAL META TABLE + // + + // convert the slab offset to an extent offset + bool const is_offset = (get_sub_group_local_id() & 3) == 1; + uint const adjust = is_offset ? gmem_base - 1 : 0; + + // + // only process the meta components found in this slab + // + uint const cc_min = c_min * SKC_META_COMPONENTS; + uint const cc_max = c_max * SKC_META_COMPONENTS + SKC_META_COMPONENTS - 1; + uint cc = (cc_min & ~HS_LANE_MASK) + get_sub_group_local_id(); + + if ((cc >= cc_min) && (cc <= cc_max)) + { + uint const c = shared.c[cc]; + + if (c != 0) + atomic_add(metas+cc,c+adjust); + } + + cc += HS_LANES_PER_WARP; + + for (; cc<=cc_max; cc+=HS_LANES_PER_WARP) + { + uint const c = shared.c[cc]; + + if (c != 0) + atomic_add(metas+cc,c+adjust); + } +} + +// +// +// diff --git a/src/compute/skc/platforms/cl_12/runtime_cl.c b/src/compute/skc/platforms/cl_12/runtime_cl.c deleted file mode 100644 index a745ed013e..0000000000 --- a/src/compute/skc/platforms/cl_12/runtime_cl.c +++ /dev/null @@ -1,362 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. 
- * - */ - -// -// -// - -#include <stdio.h> -#include <string.h> -#include <stdlib.h> -#include <assert.h> - -// -// -// - -#include "runtime_cl.h" -#include "common/cl/assert_cl.h" - -// -// -// - -static is_verbose = true; - -// -// FIXME -- all variable length device queries need to start querying -// the parameter's return size before getting its value -// -// FIXME -- this is now handled by the common/cl/find.* routine -// - -union skc_cl_device_version { - struct { - cl_uchar opencl_space[7]; // "OpenCL_" - cl_uchar major; - cl_uchar dot; - cl_uchar minor; -#if 1 // Intel NEO requires at least 16 bytes - cl_uchar space; - cl_uchar vendor[32]; -#endif - }; - struct { - cl_uchar aN[]; - }; -}; - -typedef cl_bitfield cl_diagnostic_verbose_level_intel; - -#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106 -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL 0x2 -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL 0x1 -#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL 0x4 - -static -void -CL_CALLBACK -skc_context_callback(char const * error, void const * info, size_t size, void * user) -{ - if (info != NULL ) - { - fprintf(stderr,"%s\n",error); - } -} - -// -// -// - -skc_err -skc_runtime_cl_create(struct skc_runtime_cl * const runtime_cl, - char const * const target_platform_substring, - char const * const target_device_substring, - cl_context_properties context_properties[]) -{ - skc_err err = SKC_ERR_SUCCESS; - - // - // search available devices for a match - // -#define PLATFORM_IDS_MAX 16 -#define DEVICE_IDS_MAX 16 -#define PLATFORM_NAME_SIZE_MAX 64 -#define DEVICE_NAME_SIZE_MAX 64 -#define DRIVER_VERSION_SIZE_MAX 64 - - cl_int cl_err; - - cl_platform_id platform_ids[PLATFORM_IDS_MAX]; - cl_device_id device_ids [PLATFORM_IDS_MAX][DEVICE_IDS_MAX]; - - cl_uint platform_count; - cl_uint device_count[PLATFORM_IDS_MAX]; - - cl_uint platform_idx = UINT32_MAX, device_idx = UINT32_MAX; - - bool match = false; // find _first_ match - - // - // get number of platforms - // - cl(GetPlatformIDs(PLATFORM_IDS_MAX,platform_ids,&platform_count)); - - // - // search platforms - // - for (cl_uint ii=0; ii<platform_count; ii++) - { - char platform_name[PLATFORM_NAME_SIZE_MAX]; - - cl(GetPlatformInfo(platform_ids[ii], - CL_PLATFORM_NAME, - sizeof(platform_name), - platform_name, - NULL)); - - if (!match && (strstr(platform_name,target_platform_substring) != NULL)) - { - platform_idx = ii; - } - - if (is_verbose) { - fprintf(stdout,"%2u: %s\n",ii,platform_name); - } - - cl_err = clGetDeviceIDs(platform_ids[ii], - CL_DEVICE_TYPE_ALL, - DEVICE_IDS_MAX, - device_ids[ii], - device_count+ii); - - if (cl_err != CL_DEVICE_NOT_FOUND) - cl_ok(cl_err); - - for (cl_uint jj=0; jj<device_count[ii]; jj++) - { - char device_name[DEVICE_NAME_SIZE_MAX]; - union skc_cl_device_version device_version; - cl_uint device_align_bits; - char driver_version[DRIVER_VERSION_SIZE_MAX]; - - cl(GetDeviceInfo(device_ids[ii][jj], - CL_DEVICE_NAME, - sizeof(device_name), - device_name, - NULL)); - - // FIXME -- some of these variable length parameters should - // use the "size the param before reading" idiom - cl(GetDeviceInfo(device_ids[ii][jj], - CL_DEVICE_VERSION, - sizeof(device_version), - device_version.aN, - NULL)); - - cl(GetDeviceInfo(device_ids[ii][jj], - CL_DEVICE_MEM_BASE_ADDR_ALIGN, - sizeof(device_align_bits), - &device_align_bits, - NULL)); - - cl_uint const base_align = device_align_bits / 8; // bytes - - cl(GetDeviceInfo(device_ids[ii][jj], - CL_DRIVER_VERSION, - sizeof(driver_version), - driver_version, - NULL)); - - if (!match 
&& (platform_idx == ii) && (strstr(device_name,target_device_substring) != NULL)) - { - match = true; - device_idx = jj; - - runtime_cl->version.major = device_version.major - 48; - runtime_cl->version.minor = device_version.minor - 48; - runtime_cl->base_align = base_align; - - if (is_verbose) { - fprintf(stdout," >>>"); - } - } - else if (is_verbose) - { - fprintf(stdout," "); - } - - if (is_verbose) { - fprintf(stdout, - " %1u: %s [ %s ] [ %s ] [ %u ]\n", - jj, - device_name, - device_version.aN, - driver_version, - base_align); - } - } - } - - if (is_verbose) { - fprintf(stdout,"\n"); - } - - // - // get target platform and device - // - if (platform_idx >= platform_count) - { - fprintf(stderr,"no match for target platform substring %s\n",target_platform_substring); - exit(EXIT_FAILURE); - } - if (device_idx >= device_count[platform_idx]) - { - fprintf(stderr,"no match for target device substring %s\n",target_device_substring); - exit(EXIT_FAILURE); - } - - runtime_cl->platform_id = platform_ids[platform_idx]; - runtime_cl->device_id = device_ids [platform_idx][device_idx]; - - // - // create context - // - -#if 0 - cl_context_properties context_properties[] = - { - CL_CONTEXT_PLATFORM,(cl_context_properties)runtime_cl->platform_id, - 0 - }; -#else - context_properties[1] = (cl_context_properties)runtime_cl->platform_id; -#endif - - runtime_cl->context = clCreateContext(context_properties, - 1, - &runtime_cl->device_id, - skc_context_callback, - NULL, - &cl_err); - cl_ok(cl_err); - - // - // get device name, driver version, and unified memory flag - // - if (is_verbose) - { - char device_name[DEVICE_NAME_SIZE_MAX]; - char driver_version[DRIVER_VERSION_SIZE_MAX]; - cl_bool device_is_unified; - cl_device_svm_capabilities svm_caps; - size_t printf_buffer_size; - - cl(GetDeviceInfo(runtime_cl->device_id, - CL_DEVICE_NAME, - sizeof(device_name), - device_name, - NULL)); - - cl(GetDeviceInfo(runtime_cl->device_id, - CL_DRIVER_VERSION, - sizeof(driver_version), - driver_version, - NULL)); - - cl(GetDeviceInfo(runtime_cl->device_id, - CL_DEVICE_HOST_UNIFIED_MEMORY, - sizeof(device_is_unified), - &device_is_unified, - NULL)); - - cl(GetDeviceInfo(runtime_cl->device_id, - CL_DEVICE_SVM_CAPABILITIES, - sizeof(svm_caps), - &svm_caps, - 0)); - - cl(GetDeviceInfo(runtime_cl->device_id, - CL_DEVICE_PRINTF_BUFFER_SIZE, - sizeof(printf_buffer_size), - &printf_buffer_size, - NULL)); - - fprintf(stderr, - "CL_DEVICE_SVM_COARSE_GRAIN_BUFFER %c\n" - "CL_DEVICE_SVM_FINE_GRAIN_BUFFER %c\n" - "CL_DEVICE_SVM_FINE_GRAIN_SYSTEM %c\n" - "CL_DEVICE_SVM_ATOMICS %c\n" - "CL_DEVICE_PRINTF_BUFFER_SIZE %zu\n\n", - svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? '*' : '-', - svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? '*' : '-', - svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? '*' : '-', - svm_caps & CL_DEVICE_SVM_ATOMICS ? 
'*' : '-', - printf_buffer_size); - } - - return err; -} - -// -// -// - -skc_err -skc_runtime_cl_dispose(struct skc_runtime_cl * const runtime_cl) -{ - // FIXME - printf("%s incomplete!\n",__func__); - - return SKC_ERR_SUCCESS; -} - -// -// -// - -cl_command_queue -skc_runtime_cl_create_cq(struct skc_runtime_cl * const runtime_cl, skc_cq_type_e const type) -{ - cl_command_queue cq; - - if (runtime_cl->version.major < 2) - { - // - // <= OpenCL 1.2 - // - cl_int cl_err; - - cq = clCreateCommandQueue(runtime_cl->context, - runtime_cl->device_id, - (cl_command_queue_properties)type, - &cl_err); cl_ok(cl_err); - } - else - { - // - // >= OpenCL 2.0 - // - cl_int cl_err; - cl_queue_properties const queue_properties[] = { - CL_QUEUE_PROPERTIES,(cl_queue_properties)type,0 - }; - - cq = clCreateCommandQueueWithProperties(runtime_cl->context, - runtime_cl->device_id, - queue_properties, - &cl_err); cl_ok(cl_err); - } - - return cq; -} - -// -// -// - diff --git a/src/compute/skc/platforms/cl_12/runtime_cl.h b/src/compute/skc/platforms/cl_12/runtime_cl.h deleted file mode 100644 index 9e58ca0cc7..0000000000 --- a/src/compute/skc/platforms/cl_12/runtime_cl.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright 2017 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can - * be found in the LICENSE file. - * - */ - -#pragma once - -// -// squelch OpenCL 1.2 deprecation warning -// - -#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS -#define CL_USE_DEPRECATED_OPENCL_1_2_APIS -#endif - -#include <CL/opencl.h> - -// -// -// - -#include "skc.h" - -// -// Minimal OpenCL state needed by the runtime to get started -// - -struct skc_runtime_cl -{ - cl_platform_id platform_id; - cl_device_id device_id; - cl_context context; - - struct { - cl_uint major; - cl_uint minor; - } version; // sometimes we need to know this at runtime - - cl_uint base_align; // base address alignment for subbuffer origins -}; - -// -// -// - -typedef enum skc_cq_type_e { - SKC_CQ_TYPE_IN_ORDER = 0, - SKC_CQ_TYPE_OUT_OF_ORDER = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, - SKC_CQ_TYPE_IN_ORDER_PROFILING = (SKC_CQ_TYPE_IN_ORDER | CL_QUEUE_PROFILING_ENABLE), - SKC_CQ_TYPE_OUT_OF_ORDER_PROFILING = (SKC_CQ_TYPE_OUT_OF_ORDER | CL_QUEUE_PROFILING_ENABLE), -} skc_cq_type_e; - -// -// safely creates a generic OpenCL target in very few lines -// - -skc_err -skc_runtime_cl_create(struct skc_runtime_cl * const runtime_cl, - char const * const target_platform_substring, - char const * const target_device_substring, - cl_context_properties context_properties[]); - -skc_err -skc_runtime_cl_dispose(struct skc_runtime_cl * const runtime_cl); - -// -// create a command queue with the non-deprecated function -// - -cl_command_queue -skc_runtime_cl_create_cq(struct skc_runtime_cl * const runtime_cl, skc_cq_type_e const type); - -// -// -// - diff --git a/src/compute/skc/platforms/cl_12/runtime_cl_12.c b/src/compute/skc/platforms/cl_12/runtime_cl_12.c index fca13edbbd..a4a578fa29 100644 --- a/src/compute/skc/platforms/cl_12/runtime_cl_12.c +++ b/src/compute/skc/platforms/cl_12/runtime_cl_12.c @@ -24,7 +24,6 @@ #include "grid.h" #include "common/cl/assert_cl.h" #include "config_cl.h" -#include "runtime_cl.h" #include "runtime_cl_12.h" #include "export_cl_12.h" @@ -32,7 +31,7 @@ // // -static +static void skc_block_pool_create(struct skc_runtime * const runtime, cl_command_queue cq) { @@ -42,7 +41,7 @@ skc_block_pool_create(struct skc_runtime * const runtime, cl_command_queue cq) // create block extent skc_extent_pdrw_alloc(runtime, 
&runtime->block_pool.blocks, - runtime->block_pool.size->pool_size * + runtime->block_pool.size->pool_size * runtime->config->block.bytes); // allocate block pool ids @@ -85,7 +84,7 @@ skc_block_pool_create(struct skc_runtime * const runtime, cl_command_queue cq) cl(ReleaseKernel(k1)); } -static +static void skc_block_pool_dispose(struct skc_runtime * const runtime) { @@ -106,7 +105,7 @@ skc_runtime_yield(struct skc_runtime * const runtime) } static -void +void skc_runtime_wait(struct skc_runtime * const runtime) { skc_scheduler_wait(runtime->scheduler); @@ -118,18 +117,26 @@ skc_runtime_wait(struct skc_runtime * const runtime) skc_err skc_runtime_cl_12_create(struct skc_context * const context, - char const * const target_platform_substring, - char const * const target_device_substring, - cl_context_properties context_properties[]) + cl_context context_cl, + cl_device_id device_id_cl) { // allocate the runtime struct skc_runtime * const runtime = malloc(sizeof(*runtime)); - // acquire OpenCL ids and context for target device - skc_err err = skc_runtime_cl_create(&runtime->cl, - target_platform_substring, - target_device_substring, - context_properties); + // save off CL objects + runtime->cl.context = context_cl; + runtime->cl.device_id = device_id_cl; + + // query device alignment + cl_uint align_bits; + + cl(GetDeviceInfo(device_id_cl, + CL_DEVICE_MEM_BASE_ADDR_ALIGN, + sizeof(align_bits), + &align_bits, + NULL)); + + runtime->cl.align_bytes = align_bits / 8; // create device skc_device_create(runtime); @@ -149,7 +156,7 @@ skc_runtime_cl_12_create(struct skc_context * const context, // initialize cq pool skc_cq_pool_create(runtime, &runtime->cq_pool, - runtime->config->cq_pool.type, + runtime->config->cq_pool.cq_props, runtime->config->cq_pool.size); // acquire in-order cq @@ -176,7 +183,7 @@ skc_runtime_cl_12_create(struct skc_context * const context, context->yield = skc_runtime_yield; context->wait = skc_runtime_wait; - + context->path_builder = skc_path_builder_cl_12_create; context->path_retain = skc_runtime_path_host_retain; context->path_release = skc_runtime_path_host_release; @@ -189,7 +196,7 @@ skc_runtime_cl_12_create(struct skc_context * const context, context->composition = skc_composition_cl_12_create; context->styling = skc_styling_cl_12_create; - + context->surface = skc_surface_cl_12_create; // block on pool creation @@ -198,7 +205,7 @@ skc_runtime_cl_12_create(struct skc_context * const context, // dispose of in-order cq skc_runtime_release_cq_in_order(runtime,cq); - return err; + return SKC_ERR_SUCCESS; }; // @@ -227,7 +234,7 @@ skc_runtime_cl_12_dispose(struct skc_context * const context) skc_block_pool_dispose(context->runtime); // skc_handle_pool_dispose(context->runtime); - + return SKC_ERR_SUCCESS; } @@ -253,12 +260,12 @@ skc_runtime_cl_12_debug(struct skc_context * const context) return; QueryPerformanceCounter(&EndingTime); - + LARGE_INTEGER ElapsedMicroseconds, Frequency; ElapsedMicroseconds.QuadPart = EndingTime.QuadPart - StartingTime.QuadPart; - QueryPerformanceFrequency(&Frequency); + QueryPerformanceFrequency(&Frequency); double const msecs_total = 1000.0 * ElapsedMicroseconds.QuadPart / Frequency.QuadPart; double const msecs_frame = msecs_total / SKC_FRAMES; @@ -268,7 +275,7 @@ skc_runtime_cl_12_debug(struct skc_context * const context) #endif struct skc_runtime * const runtime = context->runtime; - + // acquire out-of-order cq cl_command_queue cq = skc_runtime_acquire_cq_in_order(runtime); @@ -311,4 +318,3 @@ skc_runtime_cl_12_debug(struct skc_context * 
const context) // // // - diff --git a/src/compute/skc/platforms/cl_12/runtime_cl_12.h b/src/compute/skc/platforms/cl_12/runtime_cl_12.h index 7e7ffcb284..ff820e6872 100644 --- a/src/compute/skc/platforms/cl_12/runtime_cl_12.h +++ b/src/compute/skc/platforms/cl_12/runtime_cl_12.h @@ -12,8 +12,8 @@ // // +#include "skc.h" #include "runtime.h" -#include "runtime_cl.h" #include "cq_pool_cl.h" #include "handle_pool_cl_12.h" #include "block_pool_cl_12.h" @@ -31,7 +31,11 @@ struct skc_runtime // // state visible to device // - struct skc_runtime_cl cl; + struct { + cl_context context; + cl_device_id device_id; + cl_uint align_bytes; + } cl; struct { struct skc_allocator_host host; @@ -63,9 +67,8 @@ struct skc_runtime skc_err skc_runtime_cl_12_create(struct skc_context * const context, - char const * const target_platform_substring, - char const * const target_device_substring, - cl_context_properties context_properties[]); + cl_context context_cl, + cl_device_id device_id_cl); skc_err skc_runtime_cl_12_dispose(struct skc_context * const context); diff --git a/src/compute/skc/raster_builder.c b/src/compute/skc/raster_builder.c index 6da8071e61..a0f1fcfdb0 100644 --- a/src/compute/skc/raster_builder.c +++ b/src/compute/skc/raster_builder.c @@ -69,7 +69,7 @@ float const skc_transform_identity[8] = 0.0f, 0.0f // w0 w1 1 <-- always 1 }; -float const * const skc_transform_identity_ptr = skc_transform_identity; +// float const * const skc_transform_identity_ptr = skc_transform_identity; // // DEFAULT RASTER CLIP @@ -82,7 +82,7 @@ float const skc_raster_clip_default[4] = +FLT_MAX, +FLT_MAX // upper right corner of bounding box }; -float const * const skc_raster_clip_default_ptr = skc_raster_clip_default; +// float const * const skc_raster_clip_default_ptr = skc_raster_clip_default; #endif diff --git a/src/compute/skc/skc.h b/src/compute/skc/skc.h index e46b6a9d25..a81a5346b7 100644 --- a/src/compute/skc/skc.h +++ b/src/compute/skc/skc.h @@ -10,125 +10,18 @@ #define SKC_ONCE_SKC // -// FIXME -- get rid of these here -// - -#include <stdint.h> -#include <stdbool.h> - -// -// -// - -#include "skc_styling.h" // FIXME -- skc_styling -// #include "skc_err.h" - -// -// FIXME -- move errors to an skc prefixed include -// - -typedef enum skc_err { - - SKC_ERR_SUCCESS = 0, - - SKC_ERR_API_BASE = 10000, - - SKC_ERR_NOT_IMPLEMENTED = SKC_ERR_API_BASE, - - SKC_ERR_POOL_EMPTY, - - SKC_ERR_CONDVAR_WAIT, - - SKC_ERR_LAYER_ID_INVALID, - SKC_ERR_LAYER_NOT_EMPTY, - - SKC_ERR_TRANSFORM_WEAKREF_INVALID, - SKC_ERR_STROKE_STYLE_WEAKREF_INVALID, - - SKC_ERR_COMMAND_NOT_READY, - SKC_ERR_COMMAND_NOT_COMPLETED, - SKC_ERR_COMMAND_NOT_STARTED, - - SKC_ERR_COMMAND_NOT_READY_OR_COMPLETED, - - SKC_ERR_COMPOSITION_SEALED, - SKC_ERR_STYLING_SEALED, - - SKC_ERR_HANDLE_INVALID, - SKC_ERR_HANDLE_OVERFLOW, - - SKC_ERR_COUNT - -} skc_err; - -// -// SPINEL TYPES -// - -typedef struct skc_context * skc_context_t; -typedef struct skc_path_builder * skc_path_builder_t; -typedef struct skc_raster_builder * skc_raster_builder_t; - -typedef struct skc_composition * skc_composition_t; -typedef struct skc_styling * skc_styling_t; - -typedef struct skc_surface * skc_surface_t; - -#if 0 -typedef struct skc_interop * skc_interop_t; -typedef uint32_t skc_interop_surface_t; -#endif - -typedef uint32_t skc_path_t; -typedef uint32_t skc_raster_t; - -typedef uint32_t skc_layer_id; -typedef uint32_t skc_group_id; - -typedef uint32_t skc_styling_cmd_t; - -typedef uint64_t skc_weakref_t; -typedef skc_weakref_t skc_transform_weakref_t; -typedef skc_weakref_t 
skc_raster_clip_weakref_t; - -// -// FIXME -- bury all of this -// - -#define SKC_STYLING_CMDS(...) _countof(__VA_ARGS__),__VA_ARGS__ -#define SKC_GROUP_IDS(...) _countof(__VA_ARGS__),__VA_ARGS__ - -// // // -#define SKC_PATH_INVALID UINT32_MAX -#define SKC_RASTER_INVALID UINT32_MAX -#define SKC_WEAKREF_INVALID UINT64_MAX - -// -// TRANSFORM LAYOUT: { sx shx tx shy sy ty w0 w1 } -// - -extern float const * const skc_transform_identity_ptr; // { 1, 0, 0, 0, 1, 0, 0, 0 } - -// -// RASTER CLIP LAYOUT: { x0, y0, x1, y1 } -// - -extern float const * const skc_raster_clip_default_ptr; +#include "skc_err.h" +#include "skc_types.h" +#include "skc_styling.h" // // CONTEXT // skc_err -skc_context_create(skc_context_t * context, - char const * target_platform_substring, - char const * target_device_substring, - intptr_t context_properties[]); - -skc_err skc_context_retain(skc_context_t context); skc_err @@ -138,31 +31,6 @@ skc_err skc_context_reset(skc_context_t context); // -// COORDINATED EXTERNAL OPERATIONS -// - -/* - Examples include: - - - Transforming an intermediate layer with a blur, sharpen, rotation or scaling kernel. - - Subpixel antialiasing using neighboring pixel color and coverage data. - - Performing a blit from one region to another region on a surface. - - Blitting from one surface to another. - - Loading and processing from one region and storing to another region. - - Rendezvousing with an external pipeline. -*/ - -// -// -// - -bool -skc_context_yield(skc_context_t context); - -void -skc_context_wait(skc_context_t context); - -// // PATH BUILDER // @@ -486,6 +354,31 @@ skc_surface_render(skc_surface_t surface, void * fb); // FIXME FIXME // +// COORDINATED EXTERNAL OPERATIONS +// +// Examples include: +// +// - Transforming an intermediate layer with a blur, sharpen, rotation or scaling kernel. +// - Subpixel antialiasing using neighboring pixel color and coverage data. +// - Performing a blit from one region to another region on a surface. +// - Blitting from one surface to another. +// - Loading and processing from one region and storing to another region. +// - Rendezvousing with an external pipeline. +// + +// FORTHCOMING... + +// +// SCHEDULER +// + +bool +skc_context_yield(skc_context_t context); + +void +skc_context_wait(skc_context_t context); + +// // // diff --git a/src/compute/skc/skc_create_cl.h b/src/compute/skc/skc_create_cl.h new file mode 100644 index 0000000000..0ab0fe0cb9 --- /dev/null +++ b/src/compute/skc/skc_create_cl.h @@ -0,0 +1,70 @@ +/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +#ifndef SKC_ONCE_SKC_CREATE_CL +#define SKC_ONCE_SKC_CREATE_CL + +// +// +// + +#ifdef __APPLE__ +#include "OpenCL/opencl.h" +#else +#include "CL/opencl.h" +#endif + +// +// +// + +#include "skc.h" + +// +// CONTEXT CREATION +// + +skc_err +skc_context_create_cl(skc_context_t * context, + cl_context context_cl, + cl_device_id device_id_cl); + +// +// FIXME -- SPECIALIZE SURFACE RENDER +// + +#if 0 + +// +// SURFACE RENDER +// + +typedef void (*skc_surface_render_pfn_notify)(skc_surface_t surface, + skc_styling_t styling, + skc_composition_t composition, + void * data); +skc_err +skc_surface_render(skc_surface_t surface, + uint32_t const clip[4], + skc_styling_t styling, + skc_composition_t composition, + skc_surface_render_pfn_notify notify, + void * data, + void * fb); // FIXME FIXME + +#endif + +// +// +// + +#endif + +// +// +// diff --git a/src/compute/skc/skc_err.h b/src/compute/skc/skc_err.h new file mode 100644 index 0000000000..6587e7d266 --- /dev/null +++ b/src/compute/skc/skc_err.h @@ -0,0 +1,58 @@ +/* + * Copyright 2018 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +#ifndef SKC_ONCE_SKC_ERR +#define SKC_ONCE_SKC_ERR + +// +// +// + +typedef enum skc_err { + + SKC_ERR_SUCCESS = 0, + + SKC_ERR_API_BASE = 10000, + + SKC_ERR_NOT_IMPLEMENTED = SKC_ERR_API_BASE, + + SKC_ERR_POOL_EMPTY, + + SKC_ERR_CONDVAR_WAIT, + + SKC_ERR_LAYER_ID_INVALID, + SKC_ERR_LAYER_NOT_EMPTY, + + SKC_ERR_TRANSFORM_WEAKREF_INVALID, + SKC_ERR_STROKE_STYLE_WEAKREF_INVALID, + + SKC_ERR_COMMAND_NOT_READY, + SKC_ERR_COMMAND_NOT_COMPLETED, + SKC_ERR_COMMAND_NOT_STARTED, + + SKC_ERR_COMMAND_NOT_READY_OR_COMPLETED, + + SKC_ERR_COMPOSITION_SEALED, + SKC_ERR_STYLING_SEALED, + + SKC_ERR_HANDLE_INVALID, + SKC_ERR_HANDLE_OVERFLOW, + + SKC_ERR_COUNT + +} skc_err; + +// +// +// + +#endif + +// +// +// diff --git a/src/compute/skc/skc_styling.h b/src/compute/skc/skc_styling.h index 73cc4fc516..62b9e14067 100644 --- a/src/compute/skc/skc_styling.h +++ b/src/compute/skc/skc_styling.h @@ -80,6 +80,13 @@ typedef enum skc_styling_gradient_type_e { } skc_styling_gradient_type_e; // +// FIXME -- bury all of this once we stabilize styling +// + +#define SKC_STYLING_CMDS(...) _countof(__VA_ARGS__),__VA_ARGS__ +#define SKC_GROUP_IDS(...) _countof(__VA_ARGS__),__VA_ARGS__ + +// // // diff --git a/src/compute/skc/skc_types.h b/src/compute/skc/skc_types.h new file mode 100644 index 0000000000..0dbcf182bf --- /dev/null +++ b/src/compute/skc/skc_types.h @@ -0,0 +1,73 @@ +/* + * Copyright 2018 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ * + */ + +#ifndef SKC_ONCE_SKC_TYPES +#define SKC_ONCE_SKC_TYPES + +// +// +// + +#include <stdint.h> +#include <stdbool.h> + +// +// +// + +typedef struct skc_context * skc_context_t; +typedef struct skc_path_builder * skc_path_builder_t; +typedef struct skc_raster_builder * skc_raster_builder_t; + +typedef struct skc_composition * skc_composition_t; +typedef struct skc_styling * skc_styling_t; + +typedef struct skc_surface * skc_surface_t; + +typedef uint32_t skc_path_t; +typedef uint32_t skc_raster_t; + +typedef uint32_t skc_layer_id; +typedef uint32_t skc_group_id; + +typedef uint32_t skc_styling_cmd_t; + +typedef uint64_t skc_weakref_t; +typedef skc_weakref_t skc_transform_weakref_t; +typedef skc_weakref_t skc_raster_clip_weakref_t; + +#if 0 +typedef struct skc_interop * skc_interop_t; +typedef uint32_t skc_interop_surface_t; +#endif + +// +// +// + +#define SKC_PATH_INVALID UINT32_MAX +#define SKC_RASTER_INVALID UINT32_MAX +#define SKC_WEAKREF_INVALID UINT64_MAX + +// +// TRANSFORM LAYOUT: { sx shx tx shy sy ty w0 w1 } +// + +// +// RASTER CLIP LAYOUT: { x0, y0, x1, y1 } +// + +// +// +// + +#endif + +// +// +// |
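
Editor's note: the hunks above remove SKC's internal substring-based platform/device search (the deleted runtime_cl.c/runtime_cl.h) and replace it with caller-supplied OpenCL objects, so a brief sketch of the resulting creation flow may help. It is not part of this commit: the platform/device selection is a deliberately naive stand-in (first platform, first GPU), the GL-interop context properties are omitted, and the cl()/cl_ok() assert macros are assumed to behave as in common/cl/assert_cl.h. The application now owns the cl_context and simply hands it, together with the cl_device_id, to skc_context_create_cl() as declared in skc_create_cl.h.

//
// EDITOR'S SKETCH -- NOT PART OF THIS COMMIT
//
// The caller creates the cl_context and passes it to SKC instead of
// letting SKC pick a platform/device by name substring.
//

#include <stdlib.h>

#include "skc_create_cl.h"        // declares skc_context_create_cl() and includes CL/opencl.h + skc.h
#include "common/cl/assert_cl.h"  // cl() / cl_ok() assert macros used throughout this tree

int
main(int argc, char ** argv)
{
  //
  // pick a platform/device -- naive: first platform, first GPU
  //
  cl_platform_id platform_id_cl;
  cl_device_id   device_id_cl;

  cl(GetPlatformIDs(1,&platform_id_cl,NULL));
  cl(GetDeviceIDs(platform_id_cl,CL_DEVICE_TYPE_GPU,1,&device_id_cl,NULL));

  //
  // the application -- not SKC -- creates the CL context; a GL-interop
  // client would additionally list CL_GL_CONTEXT_KHR and the
  // platform-specific HDC/EGL properties here
  //
  cl_context_properties context_properties_cl[] =
    {
      CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id_cl,
      0
    };

  cl_int     cl_err;
  cl_context context_cl = clCreateContext(context_properties_cl,
                                          1,
                                          &device_id_cl,
                                          NULL,   // no notify callback
                                          NULL,
                                          &cl_err);
  cl_ok(cl_err);

  //
  // hand the CL objects to SKC
  //
  skc_context_t context;
  skc_err       err = skc_context_create_cl(&context,
                                            context_cl,
                                            device_id_cl);
  if (err != SKC_ERR_SUCCESS)
    return EXIT_FAILURE;

  // ... create path/raster builders, composition, styling and surface here ...

  return EXIT_SUCCESS;
}

Under this split, skc_runtime_cl_12_create() only records the supplied context and device and queries CL_DEVICE_MEM_BASE_ADDR_ALIGN to fill in cl.align_bytes, so how the context was created (interop or not) stays entirely on the application side.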