path: root/src/compute/skc/platforms/cl_12/kernels/place.cl
Diffstat (limited to 'src/compute/skc/platforms/cl_12/kernels/place.cl')
-rw-r--r--  src/compute/skc/platforms/cl_12/kernels/place.cl  1742
1 file changed, 871 insertions(+), 871 deletions(-)
diff --git a/src/compute/skc/platforms/cl_12/kernels/place.cl b/src/compute/skc/platforms/cl_12/kernels/place.cl
index 92fa0a243d..8866bdb3e6 100644
--- a/src/compute/skc/platforms/cl_12/kernels/place.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/place.cl
@@ -1,871 +1,871 @@
-/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "tile.h"
-#include "common.h"
-#include "raster.h"
-#include "atomic_cl.h"
-#include "device_cl_12.h"
-
-//
-//
-//
-
-#define SKC_PLACE_SUBGROUP_MASK (SKC_PLACE_SUBGROUP_SIZE - 1)
-#define SKC_PLACE_SUBGROUP_LAST (SKC_PLACE_SUBGROUP_SIZE - 1)
-
-//
-//
-//
-
-#define SKC_PLACE_SMEM_COUNT_TTSK SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE)
-#define SKC_PLACE_SMEM_COUNT_TTPK SKC_RASTER_NODE_MAX_TTPK
-
-//
-//
-//
-
-#define SKC_PLACE_X (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE)
-
-//
-//
-//
-
-#if ( SKC_PLACE_X == 1 )
-#define SKC_PLACE_EXPAND() SKC_EXPAND_1()
-#define SKC_PLACE_EXPAND_I_LAST 0
-
-#elif ( SKC_PLACE_X == 2 )
-#define SKC_PLACE_EXPAND() SKC_EXPAND_2()
-#define SKC_PLACE_EXPAND_I_LAST 1
-
-#elif ( SKC_PLACE_X == 4 )
-#define SKC_PLACE_EXPAND() SKC_EXPAND_4()
-#define SKC_PLACE_EXPAND_I_LAST 3
-
-#elif ( SKC_PLACE_X == 8 )
-#define SKC_PLACE_EXPAND() SKC_EXPAND_8()
-#define SKC_PLACE_EXPAND_I_LAST 7
-
-#elif ( SKC_PLACE_X == 16)
-#define SKC_PLACE_EXPAND() SKC_EXPAND_16()
-#define SKC_PLACE_EXPAND_I_LAST 15
-#endif
-
-//
-// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE
-// COALESCED WRITES. LO FIRST, FOLLOWED BY HI.
-//
-// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE
-// KERNELS USE DIFFERENT SUBGROUP SIZES.
-//
-// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE
-// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID.
-//
-// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER
-// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). VULKAN MAY
-// ONLY SUPPORT A SUBGROUP SIZE OF 16.
-//
-
-#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE )
-
-#define SKC_PLACE_STRIDE_H(L) (L)
-#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
-#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE)
-
-#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1
-
-#define SKC_PLACE_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE)
-#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_RATIO - 1)
-#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I) ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK))
-
-#define SKC_PLACE_STRIDE_H(L) (L)
-#define SKC_PLACE_STRIDE_V_LO(I) (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE)
-#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE)
-
-#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1
-
-#define SKC_PLACE_SUBGROUP_RATIO (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
-#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
-
-#define SKC_PLACE_STRIDE_H(L) (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK))
-#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
-#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO)
-
-#endif
-
-//
-// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE
-// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8)
-//
-
-#define SKC_PLACE_IS_ALL_HEADER_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS)
-
-#define SKC_PLACE_IS_NOT_HEADER_ROW(i) ( (i) * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS)
-
-#define SKC_PLACE_IS_TRAILING_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS)
-
-#define SKC_PLACE_IS_HEADER_ROW_KEY(i) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
-
-
-//
-// Note: HEADER_LESS_THAN purposefully wraps unsigned integer to ~UINT_MAX
-//
-#define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
-#define SKC_PLACE_NODE_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() < (k))
-
-//
-// TTSK v2:
-//
-// 0 63
-// | TTSB ID | PREFIX | SPAN | X | Y |
-// +---------+--------+---------+-----+-----+
-// | 27 | 1 (=0) | 12 (=0) | 12 | 12 |
-//
-//
-// TTPK v2:
-//
-// 0 63
-// | TTPB ID | PREFIX | SPAN | X | Y |
-// +---------+--------+------+-----+-----+
-// | 27 | 1 (=1) | 12 | 12 | 12 |
-//
-//
-
-//
-// TTCK (32-BIT COMPARE) v1:
-//
-// 0 63
-// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
-// +----------------------+--------+--------+-------+-----+-----+
-// | 30 | 1 | 1 | 18 | 7 | 7 |
-//
-//
-// TTCK (32-BIT COMPARE) v2:
-//
-// 0 63
-// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
-// +----------------------+--------+--------+-------+-----+-----+
-// | 30 | 1 | 1 | 15 | 9 | 8 |
-//
-//
-// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
-//
-// 0 63
-// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
-// +----------------------+--------+--------+-------+-----+-----+
-// | 27 | 1 | 1 | 18 | 9 | 8 |
-//
-
-union skc_subgroup_smem
-{
- skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // will only use SKC_PLACE_SUBGROUP_SIZE
-
- struct {
- struct {
- skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
- skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
- } lo;
-
- struct {
- skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
- skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
- } hi;
-
- // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK];
- };
-
-};
-
-//
-// scatter scan max
-//
-static
-skc_int_v_t
-skc_scatter_scan_max(__local union skc_subgroup_smem volatile * const smem,
- skc_int_v_t const iss,
- skc_int_v_t const ess)
-{
- //
- // prefix sums determine which lanes we're going to work on next
- //
- skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE);
- skc_int_v_t const scratch_idx = max(ess,0);
-
- //
- // SIMT
- //
-
- //
- // zero the volatile smem scratchpad using vector syntax
- //
- smem->scratch[get_sub_group_local_id()] = ( 0 );
-
- //
- // store source lane at starting lane
- //
- if (is_scratch_store) {
- smem->scratch[scratch_idx] = get_sub_group_local_id();
- }
-
- //
- // propagate lanes to right using max scan
- //
- skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()];
- skc_int_v_t const source = sub_group_scan_inclusive_max(scratch);
-
- return source;
-}
-
-//
-//
-//
-
-static
-skc_bool
-skc_xk_clip(union skc_tile_clip const * const tile_clip,
- skc_ttxk_t * const xk)
-{
- //
- // clip the sk and pk keys
- //
- // if fully clipped then return false
- //
- // alternatively -- we can expand all these keys in place
- //
-  // alternatively -- keep sk and pk keys segregated because sk keys
-  // represent the vast majority of keys and are easier to process.
- // don't mess with the fastpath!
- //
- return false;
-}
-
-//
-//
-//
-
-static
-skc_ttck_t
-skc_sk_to_ck(__local union skc_subgroup_smem volatile * const smem,
- union skc_cmd_place const * const cmd,
- skc_uint const sk_idx)
-{
- skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0
- skc_uint const hi = smem->hi.sk[sk_idx];
-
- skc_ttck_t ck;
-
- ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id
-
- // FIXME -- x and y should already be clipped and shifted
- skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
- skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
-
- ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;
-
- return ck;
-}
-
-static
-skc_ttck_t
-skc_pk_to_ck(__local union skc_subgroup_smem volatile * const smem,
- union skc_cmd_place const * const cmd,
- skc_uint const pk_idx,
- skc_uint const dx)
-{
- skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1
- skc_uint const hi = smem->hi.pk[pk_idx];
-
- skc_ttck_t ck;
-
- ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id
-
- // FIXME -- x and y should already be clipped and shifted
- skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
- skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
-
- ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;
-
- return ck;
-}
-
-//
-//
-//
-
-static
-void
-skc_ttsk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics,
- __global skc_ttck_t * const ck_extent,
- __local union skc_subgroup_smem volatile * const smem,
- union skc_cmd_place const * const cmd,
- skc_uint const sk)
-{
- //
- // Pretty sure you can never ever have an sk count equal to 0
- //
- skc_uint ck_base = 0;
-
- // last lane performs the block pool allocation with an atomic increment
- if (get_sub_group_local_id() == 0) {
- ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk);
- }
-
- // broadcast base to all lanes
- ck_base = sub_group_broadcast(ck_base,0);
-
- // convert sk keys to ck keys
- for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE)
- {
- ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii);
- }
-}
-
-//
-//
-//
-
-static
-skc_int
-skc_ttpk_get_span(__local union skc_subgroup_smem volatile * const smem,
- skc_uint const idx)
-{
- skc_uint const lo = smem->lo.pk[idx];
- skc_uint const hi = smem->hi.pk[idx];
-
- skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN;
- skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN;
-
- return (span_lo | span_hi) + 1;
-}
-
-//
-//
-//
-
-static
-void
-skc_ttpk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics,
- __global skc_ttck_t * const ck_extent,
- __local union skc_subgroup_smem volatile * const smem,
- union skc_cmd_place const * const cmd,
- skc_uint const pk)
-{
- // bail out if pk queue is empty
- if (pk == 0)
- return;
-
-#if 0
- if (get_sub_group_local_id() == 0)
- printf("%u\n",pk);
-#endif
-
- //
- // FIXME -- this nested loop iterates over the queue processing a
- // subgroup of 64-bit keys at a time. This is probably not the most
- // efficient approach so investigate how to store and iterate over a
- // wider than subgroup (node-sized) queue of keys.
- //
-
- // round up so we work with full subgroups
- skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK;
- skc_uint ii = 0;
-
- // nested loop that expands all ttpk keys
-#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE)
- for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE)
-#endif
- {
- skc_uint idx = ii + get_sub_group_local_id();
- skc_int span = 0;
-
- // how many tiles does this ttpk span?
- if (idx < pk)
- span = skc_ttpk_get_span(smem,idx);
-
- // we need inclusive, exclusive and total
- skc_int iss = sub_group_scan_inclusive_add(span);
- skc_int ess = iss - span;
- skc_int rem = sub_group_broadcast(iss,SKC_PLACE_SUBGROUP_SIZE-1);
-
- // printf("%u : %u\n",span,iss);
- // continue;
-
- // atomically allocate space for the pk keys
- skc_uint ck_base = 0;
-
- // last lane performs the block pool allocation with an atomic increment
- if (get_sub_group_local_id() == 0) {
- ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,rem);
- }
-
- // broadcast atomically allocated extent base to all lanes
- skc_uint ck_idx = sub_group_broadcast(ck_base,0) + get_sub_group_local_id();
-
- //
- // FIXME -- this loop would probably be faster if the ttpk keys
- // were held in registers and accessed with shuffles instead of
- // SMEM loads
- //
-
- //
- // loop until there are no more expanded pk keys
- //
- while (true)
- {
- skc_int const source = skc_scatter_scan_max(smem,iss,ess);
- skc_int const dx = get_sub_group_local_id() - intel_sub_group_shuffle(ess,source);
-
- // store valid ck keys to gmem
- if (get_sub_group_local_id() < rem) {
- ck_extent[ck_idx] = skc_pk_to_ck(smem,cmd,ii+source,dx);
- }
-
- // decrement remainder
- rem -= SKC_PLACE_SUBGROUP_SIZE;
-
- if (rem <= 0)
- break;
-
- // increment/decrement indices
- ck_idx += SKC_PLACE_SUBGROUP_SIZE;
- iss -= SKC_PLACE_SUBGROUP_SIZE;
- ess -= SKC_PLACE_SUBGROUP_SIZE;
- }
- }
-}
-
-//
-//
-//
-
-static
-skc_uint
-skc_ballot(skc_uint * const xk, skc_uint const is_xk)
-{
-#if 0
- //
- // FIXME -- when available, this should use the idiom:
- //
- // ballot() + lane_mask_less_than_or_equal + popcount()
- //
- // Supported by:
- //
- // - Vulkan 1.1 / SPIR-V 1.3
- // - CUDA
- // - AVX2 (SSE*?)
- //
-#else
- //
- // otherwise, emulate with an inclusive scan (yuk)
- //
- skc_uint const prefix = sub_group_scan_inclusive_add(is_xk);
-
- skc_uint const xk_idx = *xk + prefix - is_xk;
-
- *xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST);
-
-#if 0
- printf("< %3u >\n",xk_idx);
-#endif
-
- return xk_idx;
-#endif
-}
-
-//
-//
-//
-__kernel
-SKC_PLACE_KERNEL_ATTRIBS
-void
-skc_kernel_place(__global skc_bp_elem_t * const bp_elems,
- __global SKC_ATOMIC_UINT volatile * const place_atomics,
- __global skc_ttck_t * const ck_extent,
- __global union skc_cmd_place const * const cmds,
- __global skc_block_id_t * const map,
- skc_uint4 const clip,
- skc_uint const count)
-{
- //
- // declare shared memory block
- //
-#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
- __local union skc_subgroup_smem volatile smem[1];
-#else
- __local union skc_subgroup_smem volatile smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS];
- __local union skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
-#endif
-
- //
- // This is a subgroup-centric kernel
- //
- // Which subgroup in the grid is this?
- //
- // TAKE NOTE: the Intel GEN compiler appears to be recognizing
- // get_group_id(0) as a uniform but the alternative calculation used
- // when there are multiple subgroups per workgroup is not
- // cooperating and driving spillage elsewhere.
- //
- // Test the raster's translated bounds against the composition's
- // tile clip
- //
- // There are 3 cases:
- //
- // - the raster is completely clipped -> return
-  //   - the raster is partially clipped -> all keys must be clipped
- // - the raster is not clipped -> no keys are tested
- //
- //
- // There are at least 4 implementations of place and we want to
- // special-case them as much as possible so that, at the least, the
- // fastpath remains fast.
- //
- // - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP
- //
- // - implement CLIPPED + NO TRANSLATION path
- //
- // - implement NO CLIP + TRANSLATION path
- //
- // - implement CLIPPED + TRANSLATION path
- //
- //
- // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin
- // 12:12:8 integer where:
- //
- // 12: ttsk
- // 12: ttpk
- // 8: /dev/null -- clipped or invalid key
- //
- // Three kinds of nodes in a raster's list:
- //
- // - the head node
- // - an internal node
- // - the final node
- //
-
-#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
- skc_uint const cmd_idx = get_group_id(0);
-#else
- skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id();
-#endif
-
- // load command
- union skc_cmd_place const cmd = cmds[cmd_idx];
-
- // get the raster header from the raster host id -- scalar
- skc_block_id_t id = map[cmd.raster_h];
-
- //
- // load all of the head block ttxk keys into registers
- //
- // FIXME -- this pattern lends itself to using the higher
- // performance Intel GEN block load instructions
- //
- skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- union skc_raster_node_elem const h##I = { \
- .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)], \
- bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)] } \
- };
-
- SKC_PLACE_EXPAND();
-
- //
- // load raster header counts -- we only need the "nodes" and "keys"
- // words but the keys we loaded are doublewords.
- //
- // FIXME -- this can be made portable with compile-time macro expansion
- //
- skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
- skc_uint keys = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
-
- //
- //
- //
-#if 0
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \
- nodes,keys, \
- I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \
- h##I.u32v2.hi,h##I.u32v2.lo, \
- h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);
-
- SKC_PLACE_EXPAND();
-#endif
-
- //
-#if 0
- if (get_sub_group_local_id() == 0) {
- printf("place: %u / %u / %u\n",head_id,nodes,keys);
- }
-#endif
-
- {
- //
- // classify every key in the header
- //
- // keys: 0 is not a key / 1 is a key
- // skpk: 0 is sk / 1 is pk
- //
- skc_uint bits_keys = 0;
- skc_uint bits_skpk = 0;
-
- //
- // calculate bits_keys
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
- skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \
- if (idx < keys) { \
- bits_keys |= (1u << I); \
- } \
- if (SKC_PLACE_IS_TRAILING_ROW(I)) { \
- if (keys > SKC_RASTER_HEAD_COUNT_KEYS) { \
- if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \
- bits_keys &= ~(1u << I); \
- } \
- } \
- } \
- }
-
- SKC_PLACE_EXPAND();
-
- //
- // blindly calculate bits_skpk
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
- bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
- }
-
- SKC_PLACE_EXPAND();
-
-#if 0
- printf("%2X : %2X\n",bits_keys,bits_skpk);
-#endif
-
- //
- // next pointer is last element of last row. save it now because
- // this might be recognized as a subgroup-uniform/scalar.
- //
- id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);
-
- //
- // append SK keys first
- //
- skc_uint const bits_sk = bits_keys & ~bits_skpk;
- skc_uint sk = 0;
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
- skc_uint is_sk = (bits_sk >> I) & 1; \
- skc_uint sk_idx = skc_ballot(&sk,is_sk); \
- if (is_sk) { \
- smem->lo.sk[sk_idx] = h##I.xk.lo; \
- smem->hi.sk[sk_idx] = h##I.xk.hi; \
- } \
- }
-
- SKC_PLACE_EXPAND();
-
- //
- // append PK keys next
- //
- skc_uint const bits_pk = bits_keys & bits_skpk;
- skc_uint pk = 0;
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
- skc_uint is_pk = (bits_pk >> I) & 1; \
- skc_uint pk_idx = skc_ballot(&pk,is_pk); \
- if (is_pk) { \
- smem->lo.pk[pk_idx] = h##I.xk.lo; \
- smem->hi.pk[pk_idx] = h##I.xk.hi; \
- } \
- }
-
- SKC_PLACE_EXPAND();
-
-#if 0
- printf("%2u * %2u\n",sk,pk);
-#endif
- //
- // flush the keys
- //
- skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
- skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
- }
-
- //
- // we're done if there was only a head node
- //
- if (nodes == 0)
- return;
-
- //
- // decrement keys
- //
- keys -= SKC_RASTER_HEAD_COUNT_KEYS;
-
- //
- // otherwise, append keys in trailing nodes to smem
- //
- while (true)
- {
- //
- // load all of the node block ttxk keys into registers
- //
- // FIXME -- this pattern lends itself to using the higher
- // performance Intel GEN block load instructions
- //
- skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- union skc_raster_node_elem const n##I = { \
- .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)], \
- bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)] } \
- };
-
- SKC_PLACE_EXPAND();
-
-#if 0
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \
- nodes,keys, \
- I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \
- n##I.u32v2.hi,n##I.u32v2.lo, \
- n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);
-
- SKC_PLACE_EXPAND();
-#endif
-
- //
- // classify every key in the header
- //
- // keys: 0 is not a key / 1 is a key
- // skpk: 0 is sk / 1 is pk
- //
- skc_uint bits_keys = 0;
- skc_uint bits_skpk = 0;
-
- //
- // calculate bits_keys
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) { \
- skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \
- if (idx < keys) { \
- bits_keys |= (1u << I); \
- } \
- if (SKC_PLACE_IS_TRAILING_ROW(I)) { \
- if (keys > SKC_RASTER_NODE_COUNT_KEYS) { \
- if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \
- bits_keys &= ~(1u << I); \
- } \
- } \
- } \
- }
-
- SKC_PLACE_EXPAND();
-
- //
- // blindly calculate bits_skpk
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) { \
- bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
- }
-
- SKC_PLACE_EXPAND();
-
-#if 0
- printf("%2X : %2X\n",bits_keys,bits_skpk);
-#endif
-
- //
- // next pointer is last element of last row. save it now because
- // this might be recognized as a subgroup-uniform/scalar.
- //
- id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);
-
- //
- // append SK keys first
- //
- skc_uint const bits_sk = bits_keys & ~bits_skpk;
- skc_uint sk = 0;
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) { \
- skc_uint is_sk = (bits_sk >> I) & 1; \
- skc_uint sk_idx = skc_ballot(&sk,is_sk); \
- if (is_sk) { \
- smem->lo.sk[sk_idx] = n##I.xk.lo; \
- smem->hi.sk[sk_idx] = n##I.xk.hi; \
- } \
- }
-
- SKC_PLACE_EXPAND();
-
- //
- // append PK keys next
- //
- skc_uint const bits_pk = bits_keys & bits_skpk;
- skc_uint pk = 0;
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) { \
- skc_uint is_pk = (bits_pk >> I) & 1; \
- skc_uint pk_idx = skc_ballot(&pk,is_pk); \
- if (is_pk) { \
- smem->lo.pk[pk_idx] = n##I.xk.lo; \
- smem->hi.pk[pk_idx] = n##I.xk.hi; \
- } \
- }
-
- SKC_PLACE_EXPAND();
-
-#if 0
- printf("%2u * %2u\n",sk,pk);
-#endif
- //
- // if total for either the sk or pk queue reaches the
- // highwater mark then flush it to the extent
- //
- skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
- skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
-
- //
- // if this was the last node then we're done
- //
- if (--nodes == 0)
- return;
-
- //
- // otherwise decrement keys
- //
- keys -= SKC_RASTER_NODE_COUNT_KEYS;
- }
-}
-
-//
-//
-//
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "tile.h"
+#include "common.h"
+#include "raster.h"
+#include "atomic_cl.h"
+#include "kernel_cl_12.h"
+
+//
+//
+//
+
+#define SKC_PLACE_SUBGROUP_MASK (SKC_PLACE_SUBGROUP_SIZE - 1)
+#define SKC_PLACE_SUBGROUP_LAST (SKC_PLACE_SUBGROUP_SIZE - 1)
+
+//
+//
+//
+
+#define SKC_PLACE_SMEM_COUNT_TTSK SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE)
+#define SKC_PLACE_SMEM_COUNT_TTPK SKC_RASTER_NODE_MAX_TTPK
+
+//
+//
+//
+
+#define SKC_PLACE_X (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE)
+
+//
+//
+//
+
+#if ( SKC_PLACE_X == 1 )
+#define SKC_PLACE_EXPAND() SKC_EXPAND_1()
+#define SKC_PLACE_EXPAND_I_LAST 0
+
+#elif ( SKC_PLACE_X == 2 )
+#define SKC_PLACE_EXPAND() SKC_EXPAND_2()
+#define SKC_PLACE_EXPAND_I_LAST 1
+
+#elif ( SKC_PLACE_X == 4 )
+#define SKC_PLACE_EXPAND() SKC_EXPAND_4()
+#define SKC_PLACE_EXPAND_I_LAST 3
+
+#elif ( SKC_PLACE_X == 8 )
+#define SKC_PLACE_EXPAND() SKC_EXPAND_8()
+#define SKC_PLACE_EXPAND_I_LAST 7
+
+#elif ( SKC_PLACE_X == 16)
+#define SKC_PLACE_EXPAND() SKC_EXPAND_16()
+#define SKC_PLACE_EXPAND_I_LAST 15
+#endif
+
+//
+// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE
+// COALESCED WRITES. LO FIRST, FOLLOWED BY HI.
+//
+// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE
+// KERNELS USE DIFFERENT SUBGROUP SIZES.
+//
+// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE
+// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID.
+//
+// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER
+// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). VULKAN MAY
+// ONLY SUPPORT A SUBGROUP SIZE OF 16.
+//
+
+#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE )
+
+#define SKC_PLACE_STRIDE_H(L) (L)
+#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
+#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE)
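+
+// example (assuming an 8-lane subgroup in both the prefix and place
+// kernels): the lo words of block elements 0..7 occupy dwords 0..7,
+// their hi words occupy dwords 8..15, elements 8..15 put their lo
+// words in dwords 16..23, and so on -- lo and hi rows simply alternate
+//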
+
+#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1
+
+#define SKC_PLACE_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE)
+#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_RATIO - 1)
+#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I) ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK))
+
+#define SKC_PLACE_STRIDE_H(L) (L)
+#define SKC_PLACE_STRIDE_V_LO(I) (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE)
+#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE)
+
+#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1
+
+#define SKC_PLACE_SUBGROUP_RATIO (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
+#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
+
+#define SKC_PLACE_STRIDE_H(L) (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK))
+#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
+#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO)
+
+#endif
+
+//
+// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE
+// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8)
+//
+
+#define SKC_PLACE_IS_ALL_HEADER_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS)
+
+#define SKC_PLACE_IS_NOT_HEADER_ROW(i) ( (i) * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS)
+
+#define SKC_PLACE_IS_TRAILING_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS)
+
+#define SKC_PLACE_IS_HEADER_ROW_KEY(i) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
+
+
+//
+// Note: HEADER_LESS_THAN purposefully wraps unsigned integer to ~UINT_MAX
+//
+#define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
+#define SKC_PLACE_NODE_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() < (k))
+
+//
+// TTSK v2:
+//
+// 0 63
+// | TTSB ID | PREFIX | SPAN | X | Y |
+// +---------+--------+---------+-----+-----+
+// | 27 | 1 (=0) | 12 (=0) | 12 | 12 |
+//
+//
+// TTPK v2:
+//
+// 0 63
+// | TTPB ID | PREFIX | SPAN | X | Y |
+// +---------+--------+------+-----+-----+
+// | 27 | 1 (=1) | 12 | 12 | 12 |
+//
+//
+
+//
+// TTCK (32-BIT COMPARE) v1:
+//
+// 0 63
+// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
+// +----------------------+--------+--------+-------+-----+-----+
+// | 30 | 1 | 1 | 18 | 7 | 7 |
+//
+//
+// TTCK (32-BIT COMPARE) v2:
+//
+// 0 63
+// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
+// +----------------------+--------+--------+-------+-----+-----+
+// | 30 | 1 | 1 | 15 | 9 | 8 |
+//
+//
+// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
+//
+// 0 63
+// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
+// +----------------------+--------+--------+-------+-----+-----+
+// | 27 | 1 | 1 | 18 | 9 | 8 |
+//
+
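+//
+// illustrative sketch only (not part of the pipeline, hence #if 0):
+// unpacking the 64-bit-compare TTCK layout above with plain shifts
+// and masks -- the kernel below instead builds the key as separate
+// lo/hi 32-bit words with the SKC_TTCK_* macros
+//
+#if 0
+static void
+skc_ttck_unpack_example(ulong const ck)
+{
+  uint const id     = (uint)( ck        & ((1ul << 27) - 1)); // payload / TTSB / TTPB id
+  uint const prefix = (uint)((ck >> 27) & 1);                 // 0 = TTSK, 1 = TTPK
+  uint const escape = (uint)((ck >> 28) & 1);
+  uint const layer  = (uint)((ck >> 29) & ((1ul << 18) - 1));
+  uint const x      = (uint)((ck >> 47) & ((1ul <<  9) - 1));
+  uint const y      = (uint)( ck >> 56);
+}
+#endif
+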
+union skc_subgroup_smem
+{
+ skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // will only use SKC_PLACE_SUBGROUP_SIZE
+
+ struct {
+ struct {
+ skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
+ skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
+ } lo;
+
+ struct {
+ skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
+ skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
+ } hi;
+
+ // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK];
+ };
+
+};
+
+//
+// scatter scan max
+//
+static
+skc_int_v_t
+skc_scatter_scan_max(__local union skc_subgroup_smem volatile * const smem,
+ skc_int_v_t const iss,
+ skc_int_v_t const ess)
+{
+ //
+ // prefix sums determine which lanes we're going to work on next
+ //
+ skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE);
+ skc_int_v_t const scratch_idx = max(ess,0);
+
+ //
+ // SIMT
+ //
+
+ //
+ // zero the volatile smem scratchpad using vector syntax
+ //
+ smem->scratch[get_sub_group_local_id()] = ( 0 );
+
+ //
+ // store source lane at starting lane
+ //
+ if (is_scratch_store) {
+ smem->scratch[scratch_idx] = get_sub_group_local_id();
+ }
+
+ //
+ // propagate lanes to right using max scan
+ //
+ skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()];
+ skc_int_v_t const source = sub_group_scan_inclusive_max(scratch);
+
+ return source;
+}
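+
+//
+// worked example (assuming an 8-lane subgroup): for ttpk spans
+// {3,2,4} the scans are iss = {3,5,9,...} and ess = {0,3,5,...}, so
+// lanes 0/1/2 scatter their ids to scratch[0]/scratch[3]/scratch[5]
+// and the max scan returns source = {0,0,0,1,1,2,2,2} -- output lanes
+// 0..2 expand key 0, lanes 3..4 key 1 and lanes 5..7 key 2. the
+// caller then derives dx = lane - ess[source] to number the tiles
+// within each span.
+//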
+
+//
+//
+//
+
+static
+skc_bool
+skc_xk_clip(union skc_tile_clip const * const tile_clip,
+ skc_ttxk_t * const xk)
+{
+ //
+ // clip the sk and pk keys
+ //
+ // if fully clipped then return false
+ //
+ // alternatively -- we can expand all these keys in place
+ //
+  // alternatively -- keep sk and pk keys segregated because sk keys
+  // represent the vast majority of keys and are easier to process.
+ // don't mess with the fastpath!
+ //
+ return false;
+}
+
+//
+//
+//
+
+static
+skc_ttck_t
+skc_sk_to_ck(__local union skc_subgroup_smem volatile * const smem,
+ union skc_cmd_place const * const cmd,
+ skc_uint const sk_idx)
+{
+ skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0
+ skc_uint const hi = smem->hi.sk[sk_idx];
+
+ skc_ttck_t ck;
+
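+  // the layer id straddles the ck lo/hi word boundary: its low bits
+  // are shifted up past the id/prefix/escape fields into lo and its
+  // remaining high bits land at the bottom of hi, beneath x and y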
+ ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id
+
+ // FIXME -- x and y should already be clipped and shifted
+ skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
+ skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
+
+ ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;
+
+ return ck;
+}
+
+static
+skc_ttck_t
+skc_pk_to_ck(__local union skc_subgroup_smem volatile * const smem,
+ union skc_cmd_place const * const cmd,
+ skc_uint const pk_idx,
+ skc_uint const dx)
+{
+ skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1
+ skc_uint const hi = smem->hi.pk[pk_idx];
+
+ skc_ttck_t ck;
+
+ ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id
+
+ // FIXME -- x and y should already be clipped and shifted
+ skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
+ skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
+
+ ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;
+
+ return ck;
+}
+
+//
+//
+//
+
+static
+void
+skc_ttsk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics,
+ __global skc_ttck_t * const ck_extent,
+ __local union skc_subgroup_smem volatile * const smem,
+ union skc_cmd_place const * const cmd,
+ skc_uint const sk)
+{
+ //
+ // Pretty sure you can never ever have an sk count equal to 0
+ //
+ skc_uint ck_base = 0;
+
+ // last lane performs the block pool allocation with an atomic increment
+ if (get_sub_group_local_id() == 0) {
+ ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk);
+ }
+
+ // broadcast base to all lanes
+ ck_base = sub_group_broadcast(ck_base,0);
+
+ // convert sk keys to ck keys
+ for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE)
+ {
+ ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii);
+ }
+}
+
+//
+//
+//
+
+static
+skc_int
+skc_ttpk_get_span(__local union skc_subgroup_smem volatile * const smem,
+ skc_uint const idx)
+{
+ skc_uint const lo = smem->lo.pk[idx];
+ skc_uint const hi = smem->hi.pk[idx];
+
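+  //
+  // per the TTPK layout above, the 12-bit span straddles the 32-bit
+  // word boundary: its low bits sit at the top of the lo word and the
+  // remaining bits at the bottom of the hi word. the field appears to
+  // store span-1, hence the +1 below.
+  //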
+ skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN;
+ skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN;
+
+ return (span_lo | span_hi) + 1;
+}
+
+//
+//
+//
+
+static
+void
+skc_ttpk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics,
+ __global skc_ttck_t * const ck_extent,
+ __local union skc_subgroup_smem volatile * const smem,
+ union skc_cmd_place const * const cmd,
+ skc_uint const pk)
+{
+ // bail out if pk queue is empty
+ if (pk == 0)
+ return;
+
+#if 0
+ if (get_sub_group_local_id() == 0)
+ printf("%u\n",pk);
+#endif
+
+ //
+ // FIXME -- this nested loop iterates over the queue processing a
+ // subgroup of 64-bit keys at a time. This is probably not the most
+ // efficient approach so investigate how to store and iterate over a
+ // wider than subgroup (node-sized) queue of keys.
+ //
+
+ // round up so we work with full subgroups
+ skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK;
+ skc_uint ii = 0;
+
+ // nested loop that expands all ttpk keys
+#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE)
+ for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE)
+#endif
+ {
+ skc_uint idx = ii + get_sub_group_local_id();
+ skc_int span = 0;
+
+ // how many tiles does this ttpk span?
+ if (idx < pk)
+ span = skc_ttpk_get_span(smem,idx);
+
+ // we need inclusive, exclusive and total
+ skc_int iss = sub_group_scan_inclusive_add(span);
+ skc_int ess = iss - span;
+ skc_int rem = sub_group_broadcast(iss,SKC_PLACE_SUBGROUP_SIZE-1);
+
+ // printf("%u : %u\n",span,iss);
+ // continue;
+
+ // atomically allocate space for the pk keys
+ skc_uint ck_base = 0;
+
+ // last lane performs the block pool allocation with an atomic increment
+ if (get_sub_group_local_id() == 0) {
+ ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,rem);
+ }
+
+ // broadcast atomically allocated extent base to all lanes
+ skc_uint ck_idx = sub_group_broadcast(ck_base,0) + get_sub_group_local_id();
+
+ //
+ // FIXME -- this loop would probably be faster if the ttpk keys
+ // were held in registers and accessed with shuffles instead of
+ // SMEM loads
+ //
+
+ //
+ // loop until there are no more expanded pk keys
+ //
+ while (true)
+ {
+ skc_int const source = skc_scatter_scan_max(smem,iss,ess);
+ skc_int const dx = get_sub_group_local_id() - intel_sub_group_shuffle(ess,source);
+
+ // store valid ck keys to gmem
+ if (get_sub_group_local_id() < rem) {
+ ck_extent[ck_idx] = skc_pk_to_ck(smem,cmd,ii+source,dx);
+ }
+
+ // decrement remainder
+ rem -= SKC_PLACE_SUBGROUP_SIZE;
+
+ if (rem <= 0)
+ break;
+
+ // increment/decrement indices
+ ck_idx += SKC_PLACE_SUBGROUP_SIZE;
+ iss -= SKC_PLACE_SUBGROUP_SIZE;
+ ess -= SKC_PLACE_SUBGROUP_SIZE;
+ }
+ }
+}
+
+//
+//
+//
+
+static
+skc_uint
+skc_ballot(skc_uint * const xk, skc_uint const is_xk)
+{
+#if 0
+ //
+ // FIXME -- when available, this should use the idiom:
+ //
+ // ballot() + lane_mask_less_than_or_equal + popcount()
+ //
+ // Supported by:
+ //
+ // - Vulkan 1.1 / SPIR-V 1.3
+ // - CUDA
+ // - AVX2 (SSE*?)
+ //
+#else
+ //
+ // otherwise, emulate with an inclusive scan (yuk)
+ //
+ skc_uint const prefix = sub_group_scan_inclusive_add(is_xk);
+
+ skc_uint const xk_idx = *xk + prefix - is_xk;
+
+ *xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST);
+
+#if 0
+ printf("< %3u >\n",xk_idx);
+#endif
+
+ return xk_idx;
+#endif
+}
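+
+// worked example (assuming an 8-lane subgroup): with *xk == 0 and
+// is_xk == {1,0,1,1,0,0,0,0}, the inclusive scan gives prefix ==
+// {1,1,2,3,3,3,3,3}, so the voting lanes receive xk_idx 0, 1 and 2
+// and *xk advances to 3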
+
+//
+//
+//
+__kernel
+SKC_PLACE_KERNEL_ATTRIBS
+void
+skc_kernel_place(__global skc_bp_elem_t * const bp_elems,
+ __global SKC_ATOMIC_UINT volatile * const place_atomics,
+ __global skc_ttck_t * const ck_extent,
+ __global union skc_cmd_place const * const cmds,
+ __global skc_block_id_t * const map,
+ skc_uint4 const clip,
+ skc_uint const count)
+{
+ //
+ // declare shared memory block
+ //
+#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
+ __local union skc_subgroup_smem volatile smem[1];
+#else
+ __local union skc_subgroup_smem volatile smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS];
+ __local union skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
+#endif
+
+ //
+ // This is a subgroup-centric kernel
+ //
+ // Which subgroup in the grid is this?
+ //
+ // TAKE NOTE: the Intel GEN compiler appears to be recognizing
+ // get_group_id(0) as a uniform but the alternative calculation used
+ // when there are multiple subgroups per workgroup is not
+ // cooperating and driving spillage elsewhere.
+ //
+ // Test the raster's translated bounds against the composition's
+ // tile clip
+ //
+ // There are 3 cases:
+ //
+ // - the raster is completely clipped -> return
+  //   - the raster is partially clipped -> all keys must be clipped
+ // - the raster is not clipped -> no keys are tested
+ //
+ //
+ // There are at least 4 implementations of place and we want to
+ // special-case them as much as possible so that, at the least, the
+ // fastpath remains fast.
+ //
+ // - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP
+ //
+ // - implement CLIPPED + NO TRANSLATION path
+ //
+ // - implement NO CLIP + TRANSLATION path
+ //
+ // - implement CLIPPED + TRANSLATION path
+ //
+ //
+ // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin
+ // 12:12:8 integer where:
+ //
+ // 12: ttsk
+ // 12: ttpk
+ // 8: /dev/null -- clipped or invalid key
+ //
+ // Three kinds of nodes in a raster's list:
+ //
+ // - the head node
+ // - an internal node
+ // - the final node
+ //
+
+#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
+ skc_uint const cmd_idx = get_group_id(0);
+#else
+ skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
+
+ // load command
+ union skc_cmd_place const cmd = cmds[cmd_idx];
+
+ // get the raster header from the raster host id -- scalar
+ skc_block_id_t id = map[cmd.raster_h];
+
+ //
+ // load all of the head block ttxk keys into registers
+ //
+ // FIXME -- this pattern lends itself to using the higher
+ // performance Intel GEN block load instructions
+ //
+ skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ union skc_raster_node_elem const h##I = { \
+ .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)], \
+ bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)] } \
+ };
+
+ SKC_PLACE_EXPAND();
+
+ //
+ // load raster header counts -- we only need the "nodes" and "keys"
+ // words but the keys we loaded are doublewords.
+ //
+ // FIXME -- this can be made portable with compile-time macro expansion
+ //
+ skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
+ skc_uint keys = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
+
+ //
+ //
+ //
+#if 0
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \
+ nodes,keys, \
+ I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \
+ h##I.u32v2.hi,h##I.u32v2.lo, \
+ h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);
+
+ SKC_PLACE_EXPAND();
+#endif
+
+ //
+#if 0
+ if (get_sub_group_local_id() == 0) {
+ printf("place: %u / %u / %u\n",head_id,nodes,keys);
+ }
+#endif
+
+ {
+ //
+ // classify every key in the header
+ //
+ // keys: 0 is not a key / 1 is a key
+ // skpk: 0 is sk / 1 is pk
+ //
+ skc_uint bits_keys = 0;
+ skc_uint bits_skpk = 0;
+
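+    // note: in the bits_keys macro below, lanes that are still inside
+    // the raster header underflow idx to a huge unsigned value (see
+    // the HEADER_LESS_THAN note above), so they never set a key bit
+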
+ //
+ // calculate bits_keys
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
+ skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \
+ if (idx < keys) { \
+ bits_keys |= (1u << I); \
+ } \
+ if (SKC_PLACE_IS_TRAILING_ROW(I)) { \
+ if (keys > SKC_RASTER_HEAD_COUNT_KEYS) { \
+ if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \
+ bits_keys &= ~(1u << I); \
+ } \
+ } \
+ } \
+ }
+
+ SKC_PLACE_EXPAND();
+
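+    // per the TTSK/TTPK layouts above, the TTXK prefix bit -- bit 27
+    // of the lo word -- is 0 for sk and 1 for pk; the macro below
+    // shifts it down into bit I of the per-lane classification mask
+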
+ //
+ // blindly calculate bits_skpk
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
+ bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
+ }
+
+ SKC_PLACE_EXPAND();
+
+#if 0
+ printf("%2X : %2X\n",bits_keys,bits_skpk);
+#endif
+
+ //
+ // next pointer is last element of last row. save it now because
+ // this might be recognized as a subgroup-uniform/scalar.
+ //
+ id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);
+
+ //
+ // append SK keys first
+ //
+ skc_uint const bits_sk = bits_keys & ~bits_skpk;
+ skc_uint sk = 0;
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
+ skc_uint is_sk = (bits_sk >> I) & 1; \
+ skc_uint sk_idx = skc_ballot(&sk,is_sk); \
+ if (is_sk) { \
+ smem->lo.sk[sk_idx] = h##I.xk.lo; \
+ smem->hi.sk[sk_idx] = h##I.xk.hi; \
+ } \
+ }
+
+ SKC_PLACE_EXPAND();
+
+ //
+ // append PK keys next
+ //
+ skc_uint const bits_pk = bits_keys & bits_skpk;
+ skc_uint pk = 0;
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
+ skc_uint is_pk = (bits_pk >> I) & 1; \
+ skc_uint pk_idx = skc_ballot(&pk,is_pk); \
+ if (is_pk) { \
+ smem->lo.pk[pk_idx] = h##I.xk.lo; \
+ smem->hi.pk[pk_idx] = h##I.xk.hi; \
+ } \
+ }
+
+ SKC_PLACE_EXPAND();
+
+#if 0
+ printf("%2u * %2u\n",sk,pk);
+#endif
+ //
+ // flush the keys
+ //
+ skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
+ skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
+ }
+
+ //
+ // we're done if there was only a head node
+ //
+ if (nodes == 0)
+ return;
+
+ //
+ // decrement keys
+ //
+ keys -= SKC_RASTER_HEAD_COUNT_KEYS;
+
+ //
+ // otherwise, append keys in trailing nodes to smem
+ //
+ while (true)
+ {
+ //
+ // load all of the node block ttxk keys into registers
+ //
+ // FIXME -- this pattern lends itself to using the higher
+ // performance Intel GEN block load instructions
+ //
+ skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ union skc_raster_node_elem const n##I = { \
+ .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)], \
+ bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)] } \
+ };
+
+ SKC_PLACE_EXPAND();
+
+#if 0
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \
+ nodes,keys, \
+ I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \
+ n##I.u32v2.hi,n##I.u32v2.lo, \
+ n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);
+
+ SKC_PLACE_EXPAND();
+#endif
+
+ //
+ // classify every key in the header
+ //
+ // keys: 0 is not a key / 1 is a key
+ // skpk: 0 is sk / 1 is pk
+ //
+ skc_uint bits_keys = 0;
+ skc_uint bits_skpk = 0;
+
+ //
+ // calculate bits_keys
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) { \
+ skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \
+ if (idx < keys) { \
+ bits_keys |= (1u << I); \
+ } \
+ if (SKC_PLACE_IS_TRAILING_ROW(I)) { \
+ if (keys > SKC_RASTER_NODE_COUNT_KEYS) { \
+ if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \
+ bits_keys &= ~(1u << I); \
+ } \
+ } \
+ } \
+ }
+
+ SKC_PLACE_EXPAND();
+
+ //
+ // blindly calculate bits_skpk
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) { \
+ bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
+ }
+
+ SKC_PLACE_EXPAND();
+
+#if 0
+ printf("%2X : %2X\n",bits_keys,bits_skpk);
+#endif
+
+ //
+ // next pointer is last element of last row. save it now because
+ // this might be recognized as a subgroup-uniform/scalar.
+ //
+ id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);
+
+ //
+ // append SK keys first
+ //
+ skc_uint const bits_sk = bits_keys & ~bits_skpk;
+ skc_uint sk = 0;
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) { \
+ skc_uint is_sk = (bits_sk >> I) & 1; \
+ skc_uint sk_idx = skc_ballot(&sk,is_sk); \
+ if (is_sk) { \
+ smem->lo.sk[sk_idx] = n##I.xk.lo; \
+ smem->hi.sk[sk_idx] = n##I.xk.hi; \
+ } \
+ }
+
+ SKC_PLACE_EXPAND();
+
+ //
+ // append PK keys next
+ //
+ skc_uint const bits_pk = bits_keys & bits_skpk;
+ skc_uint pk = 0;
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) { \
+ skc_uint is_pk = (bits_pk >> I) & 1; \
+ skc_uint pk_idx = skc_ballot(&pk,is_pk); \
+ if (is_pk) { \
+ smem->lo.pk[pk_idx] = n##I.xk.lo; \
+ smem->hi.pk[pk_idx] = n##I.xk.hi; \
+ } \
+ }
+
+ SKC_PLACE_EXPAND();
+
+#if 0
+ printf("%2u * %2u\n",sk,pk);
+#endif
+ //
+ // if total for either the sk or pk queue reaches the
+ // highwater mark then flush it to the extent
+ //
+ skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
+ skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
+
+ //
+ // if this was the last node then we're done
+ //
+ if (--nodes == 0)
+ return;
+
+ //
+ // otherwise decrement keys
+ //
+ keys -= SKC_RASTER_NODE_COUNT_KEYS;
+ }
+}
+
+//
+//
+//