diff options
Diffstat (limited to 'src/compute/skc/platforms/cl_12/kernels/place.cl')
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/place.cl | 1742 |
1 files changed, 871 insertions, 871 deletions
diff --git a/src/compute/skc/platforms/cl_12/kernels/place.cl b/src/compute/skc/platforms/cl_12/kernels/place.cl index 92fa0a243d..8866bdb3e6 100644 --- a/src/compute/skc/platforms/cl_12/kernels/place.cl +++ b/src/compute/skc/platforms/cl_12/kernels/place.cl @@ -1,871 +1,871 @@ -/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "tile.h"
-#include "common.h"
-#include "raster.h"
-#include "atomic_cl.h"
-#include "device_cl_12.h"
-
-// Subgroup helpers: MASK is used to round a count up to a multiple of
-// the subgroup size (see skc_ttpk_flush) and LAST is used as the index
-// of lane SKC_PLACE_SUBGROUP_SIZE - 1. Both expand to the same value.
-
-#define SKC_PLACE_SUBGROUP_MASK (SKC_PLACE_SUBGROUP_SIZE - 1)
-#define SKC_PLACE_SUBGROUP_LAST (SKC_PLACE_SUBGROUP_SIZE - 1)
-
-// Capacities of the shared memory sk and pk key queues. The sk queue is
-// never smaller than one subgroup -- presumably because the scan
-// scratchpad in union skc_subgroup_smem overlays it (confirm).
-
-#define SKC_PLACE_SMEM_COUNT_TTSK SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE)
-#define SKC_PLACE_SMEM_COUNT_TTPK SKC_RASTER_NODE_MAX_TTPK
-
-//
-// SKC_PLACE_X is SKC_DEVICE_BLOCK_DWORDS divided by the subgroup size;
-// it selects how many rows SKC_PLACE_EXPAND() expands.
-
-#define SKC_PLACE_X (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE)
-
-// Pick the SKC_EXPAND_n() macro that matches SKC_PLACE_X and record the
-// index of the last expanded row. Only 1, 2, 4, 8 and 16 are handled;
-// there is no #else, so any other value leaves both macros undefined.
-
-#if ( SKC_PLACE_X == 1 )
-#define SKC_PLACE_EXPAND() SKC_EXPAND_1()
-#define SKC_PLACE_EXPAND_I_LAST 0
-
-#elif ( SKC_PLACE_X == 2 )
-#define SKC_PLACE_EXPAND() SKC_EXPAND_2()
-#define SKC_PLACE_EXPAND_I_LAST 1
-
-#elif ( SKC_PLACE_X == 4 )
-#define SKC_PLACE_EXPAND() SKC_EXPAND_4()
-#define SKC_PLACE_EXPAND_I_LAST 3
-
-#elif ( SKC_PLACE_X == 8 )
-#define SKC_PLACE_EXPAND() SKC_EXPAND_8()
-#define SKC_PLACE_EXPAND_I_LAST 7
-
-#elif ( SKC_PLACE_X == 16)
-#define SKC_PLACE_EXPAND() SKC_EXPAND_16()
-#define SKC_PLACE_EXPAND_I_LAST 15
-#endif
-
-//
-// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE
-// COALESCED WRITES. LO FIRST, FOLLOWED BY HI.
-//
-// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE
-// KERNELS USE DIFFERENT SUBGROUP SIZES.
-//
-// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE
-// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID.
-//
-// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER
-// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). VULKAN MAY
-// ONLY SUPPORT A SUBGROUP SIZE OF 16.
-//
-
-#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE )
-
-#define SKC_PLACE_STRIDE_H(L) (L) // offset of lane L within a row
-#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE) // offset of the lo words of row I
-#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE) // offset of the hi words of row I
-
-#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE ) // reduces to the equal-size case above when the ratio is 1
-
-#define SKC_PLACE_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE)
-#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_RATIO - 1)
-#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I) ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK))
-
-#define SKC_PLACE_STRIDE_H(L) (L)
-#define SKC_PLACE_STRIDE_V_LO(I) (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE)
-#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE)
-
-#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_PLACE_SUBGROUP_SIZE ) // also reduces to the equal-size case when the ratio is 1
-
-#define SKC_PLACE_SUBGROUP_RATIO (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
-#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
-
-#define SKC_PLACE_STRIDE_H(L) (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK))
-#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
-#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO)
-
-#endif
-
-//
-// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE
-// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8)
-//
-
-// true when every element of row i lies inside the first SKC_RASTER_HEAD_DWORDS elements
-#define SKC_PLACE_IS_ALL_HEADER_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS)
-
-// true when row i starts at or after element SKC_RASTER_HEAD_DWORDS
-#define SKC_PLACE_IS_NOT_HEADER_ROW(i) ( (i) * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS)
-
-// true when row i ends exactly at SKC_DEVICE_BLOCK_DWORDS, i.e. it is the final row of a block
-#define SKC_PLACE_IS_TRAILING_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS)
-
-// same test as SKC_PLACE_HEADER_LESS_THAN(i,k); the key count k is an
-// explicit argument because the one-argument form expanded to an
-// identifier `k` that was not a macro parameter
-#define SKC_PLACE_IS_HEADER_ROW_KEY(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
-
-
-//
-// Note: HEADER_LESS_THAN purposefully lets the unsigned subtraction wrap to a value near UINT_MAX so the comparison fails
-//
-#define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
-#define SKC_PLACE_NODE_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() < (k))
-
-//
-// TTSK v2:
-//
-// 0 63
-// | TTSB ID | PREFIX | SPAN | X | Y |
-// +---------+--------+---------+-----+-----+
-// | 27 | 1 (=0) | 12 (=0) | 12 | 12 |
-//
-//
-// TTPK v2:
-//
-// 0 63
-// | TTPB ID | PREFIX | SPAN | X | Y |
-// +---------+--------+------+-----+-----+
-// | 27 | 1 (=1) | 12 | 12 | 12 |
-//
-//
-
-//
-// TTCK (32-BIT COMPARE) v1:
-//
-// 0 63
-// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
-// +----------------------+--------+--------+-------+-----+-----+
-// | 30 | 1 | 1 | 18 | 7 | 7 |
-//
-//
-// TTCK (32-BIT COMPARE) v2:
-//
-// 0 63
-// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
-// +----------------------+--------+--------+-------+-----+-----+
-// | 30 | 1 | 1 | 15 | 9 | 8 |
-//
-//
-// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
-//
-// 0 63
-// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y |
-// +----------------------+--------+--------+-------+-----+-----+
-// | 27 | 1 | 1 | 18 | 9 | 8 |
-//
-
-union skc_subgroup_smem // scan scratchpad overlaid on the sk/pk key queues
-{
- skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // one slot per lane; shares storage with the start of lo.sk
-
- struct {
- struct {
- skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
- skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
- } lo; // lo words of the queued keys
-
- struct {
- skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
- skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
- } hi; // hi words of the queued keys
-
- // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK];
- };
-
-};
-
-// scatter scan max -- every lane with iss > 0 and ess < subgroup size
-// writes its lane id to scratch slot max(ess,0); each lane then gets the
-// inclusive max scan over the slots (0 where nothing was written before it)
-static
-skc_int_v_t
-skc_scatter_scan_max(__local union skc_subgroup_smem volatile * const smem,
- skc_int_v_t const iss,
- skc_int_v_t const ess)
-{
- //
- // prefix sums determine which lanes we're going to work on next
- //
- skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE);
- skc_int_v_t const scratch_idx = max(ess,0);
-
- //
- // SIMT -- NOTE(review): no barrier separates the scratch stores from
- // the load below; presumably relies on subgroup lockstep -- confirm
-
- //
- // zero the volatile smem scratchpad using vector syntax
- // (scratch shares storage with the start of lo.sk in the union)
- smem->scratch[get_sub_group_local_id()] = ( 0 );
-
- //
- // store source lane at starting lane
- //
- if (is_scratch_store) {
- smem->scratch[scratch_idx] = get_sub_group_local_id();
- }
-
- //
- // propagate lanes to right using max scan
- //
- skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()];
- skc_int_v_t const source = sub_group_scan_inclusive_max(scratch);
-
- return source;
-}
-
-//
-// skc_xk_clip -- placeholder: the body only returns false and reads
-// neither tile_clip nor xk; the comment inside lists the intended options
-
-static
-skc_bool
-skc_xk_clip(union skc_tile_clip const * const tile_clip,
- skc_ttxk_t * const xk)
-{
- //
- // clip the sk and pk keys
- //
- // if fully clipped then return false
- //
- // alternatively -- we can expand all these keys in place
- //
- // alternatively -- keep sk and pk keys segregated because sk
- // represents the vast majority of keys and are easier to process.
- // don't mess with the fastpath!
- //
- return false;
-}
-
-// skc_sk_to_ck -- build a ttck key from the queued ttsk key at sk_idx:
-// lo is OR'd with the shifted layer id; hi combines the layer id shifted
-// right by SKC_TTCK_HI_SHR_LAYER with x and y offset by cmd->tx / cmd->ty
-
-static
-skc_ttck_t
-skc_sk_to_ck(__local union skc_subgroup_smem volatile * const smem,
- union skc_cmd_place const * const cmd,
- skc_uint const sk_idx)
-{
- skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0
- skc_uint const hi = smem->hi.sk[sk_idx];
-
- skc_ttck_t ck;
-
- ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id
-
- // FIXME -- x and y should already be clipped and shifted
- skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
- skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
-
- ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;
-
- return ck;
-}
-
-static
-skc_ttck_t
-skc_pk_to_ck(__local union skc_subgroup_smem volatile * const smem,
- union skc_cmd_place const * const cmd,
- skc_uint const pk_idx, // index of the queued ttpk key
- skc_uint const dx) // extra offset added to the key's x field
-{
- skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1; presumably the mask drops the lo span bits -- confirm
- skc_uint const hi = smem->hi.pk[pk_idx];
-
- skc_ttck_t ck;
-
- ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id
-
- // FIXME -- x and y should already be clipped and shifted
- skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
- skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
-
- ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;
-
- return ck;
-}
-
-// skc_ttsk_flush -- append the `sk` queued ttsk keys to ck_extent: lane 0
-// reserves `sk` slots with an atomic add on place_atomics, then the lanes
-// convert and store the keys in subgroup-sized strides
-
-static
-void
-skc_ttsk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics,
- __global skc_ttck_t * const ck_extent,
- __local union skc_subgroup_smem volatile * const smem,
- union skc_cmd_place const * const cmd,
- skc_uint const sk)
-{
- //
- // Pretty sure you can never ever have an sk count equal to 0
- //
- skc_uint ck_base = 0;
-
- // first lane (lane 0) reserves space in the ck extent with an atomic add
- if (get_sub_group_local_id() == 0) {
- ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk);
- }
-
- // broadcast base to all lanes
- ck_base = sub_group_broadcast(ck_base,0);
-
- // convert sk keys to ck keys
- for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE)
- {
- ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii);
- }
-}
-
-//
-// skc_ttpk_get_span -- return the span field of the queued ttpk key at
-// idx (its bits are split across the lo and hi words) plus 1
-
-static
-skc_int
-skc_ttpk_get_span(__local union skc_subgroup_smem volatile * const smem,
- skc_uint const idx)
-{
- skc_uint const lo = smem->lo.pk[idx];
- skc_uint const hi = smem->hi.pk[idx];
-
- skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN;
- skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN;
-
- return (span_lo | span_hi) + 1;
-}
-
-// skc_ttpk_flush -- expand the `pk` queued ttpk keys into one ttck key
-// per spanned tile and append them to ck_extent; a subgroup of keys is
-// scanned at a time and the inner loop emits a subgroup of ck keys per pass
-
-static
-void
-skc_ttpk_flush(__global SKC_ATOMIC_UINT volatile * const place_atomics,
- __global skc_ttck_t * const ck_extent,
- __local union skc_subgroup_smem volatile * const smem,
- union skc_cmd_place const * const cmd,
- skc_uint const pk)
-{
- // bail out if pk queue is empty
- if (pk == 0)
- return;
-
-#if 0
- if (get_sub_group_local_id() == 0)
- printf("%u\n",pk);
-#endif
-
- //
- // FIXME -- this nested loop iterates over the queue processing a
- // subgroup of 64-bit keys at a time. This is probably not the most
- // efficient approach so investigate how to store and iterate over a
- // wider than subgroup (node-sized) queue of keys.
- //
-
- // round up so we work with full subgroups
- skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK;
- skc_uint ii = 0;
-
- // nested loop that expands all ttpk keys
-#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE)
- for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE)
-#endif
- {
- skc_uint idx = ii + get_sub_group_local_id();
- skc_int span = 0;
-
- // how many tiles does this ttpk span?
- if (idx < pk)
- span = skc_ttpk_get_span(smem,idx);
-
- // we need inclusive, exclusive and total
- skc_int iss = sub_group_scan_inclusive_add(span);
- skc_int ess = iss - span;
- skc_int rem = sub_group_broadcast(iss,SKC_PLACE_SUBGROUP_SIZE-1);
-
- // printf("%u : %u\n",span,iss);
- // continue;
-
- // atomically allocate space for the pk keys
- skc_uint ck_base = 0;
-
- // first lane (lane 0) reserves `rem` slots in the ck extent with an atomic add
- if (get_sub_group_local_id() == 0) {
- ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,rem);
- }
-
- // broadcast atomically allocated extent base to all lanes
- skc_uint ck_idx = sub_group_broadcast(ck_base,0) + get_sub_group_local_id();
-
- //
- // FIXME -- this loop would probably be faster if the ttpk keys
- // were held in registers and accessed with shuffles instead of
- // SMEM loads
- //
-
- //
- // loop until there are no more expanded pk keys
- //
- while (true)
- {
- skc_int const source = skc_scatter_scan_max(smem,iss,ess);
- skc_int const dx = get_sub_group_local_id() - intel_sub_group_shuffle(ess,source);
-
- // store valid ck keys to gmem
- if (get_sub_group_local_id() < rem) {
- ck_extent[ck_idx] = skc_pk_to_ck(smem,cmd,ii+source,dx);
- }
-
- // decrement remainder
- rem -= SKC_PLACE_SUBGROUP_SIZE;
-
- if (rem <= 0)
- break;
-
- // increment/decrement indices
- ck_idx += SKC_PLACE_SUBGROUP_SIZE;
- iss -= SKC_PLACE_SUBGROUP_SIZE;
- ess -= SKC_PLACE_SUBGROUP_SIZE;
- }
- }
-}
-
-// skc_ballot -- returns *xk + (inclusive add scan of is_xk) - is_xk for
-// this lane, then advances *xk by the scan value broadcast from lane
-// SKC_PLACE_SUBGROUP_LAST
-
-static
-skc_uint
-skc_ballot(skc_uint * const xk, skc_uint const is_xk)
-{
-#if 0
- //
- // FIXME -- when available, this should use the idiom:
- //
- // ballot() + lane_mask_less_than_or_equal + popcount()
- //
- // Supported by:
- //
- // - Vulkan 1.1 / SPIR-V 1.3
- // - CUDA
- // - AVX2 (SSE*?)
- //
-#else
- //
- // otherwise, emulate with an inclusive scan (yuk)
- //
- skc_uint const prefix = sub_group_scan_inclusive_add(is_xk);
-
- skc_uint const xk_idx = *xk + prefix - is_xk;
-
- *xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST);
-
-#if 0
- printf("< %3u >\n",xk_idx);
-#endif
-
- return xk_idx;
-#endif
-}
-
-// skc_kernel_place -- one subgroup per place command: loads the raster's
-// head block, then follows the next-node ids, queueing each block's sk and
-// pk keys in smem and flushing them to ck_extent as ttck keys
-__kernel
-SKC_PLACE_KERNEL_ATTRIBS
-void
-skc_kernel_place(__global skc_bp_elem_t * const bp_elems,
- __global SKC_ATOMIC_UINT volatile * const place_atomics,
- __global skc_ttck_t * const ck_extent,
- __global union skc_cmd_place const * const cmds,
- __global skc_block_id_t * const map,
- skc_uint4 const clip, // NOTE(review): never read in this kernel body
- skc_uint const count) // NOTE(review): never read; cmd_idx below is not range-checked against it
-{
- //
- // declare shared memory block
- //
-#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
- __local union skc_subgroup_smem volatile smem[1];
-#else
- __local union skc_subgroup_smem volatile smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS];
- __local union skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
-#endif
-
- //
- // This is a subgroup-centric kernel
- //
- // Which subgroup in the grid is this?
- //
- // TAKE NOTE: the Intel GEN compiler appears to be recognizing
- // get_group_id(0) as a uniform but the alternative calculation used
- // when there are multiple subgroups per workgroup is not
- // cooperating and driving spillage elsewhere.
- //
- // Test the raster's translated bounds against the composition's
- // tile clip
- //
- // There are 3 cases:
- //
- // - the raster is completely clipped -> return
- // - the raster is partially clipped -> all keys must be clipped
- // - the raster is not clipped -> no keys are tested
- //
- //
- // There are at least 4 implementations of place and we want to
- // special-case them as much as possible so that, at the least, the
- // fastpath remains fast.
- //
- // - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP
- //
- // - implement CLIPPED + NO TRANSLATION path
- //
- // - implement NO CLIP + TRANSLATION path
- //
- // - implement CLIPPED + TRANSLATION path
- //
- //
- // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin
- // 12:12:8 integer where:
- //
- // 12: ttsk
- // 12: ttpk
- // 8: /dev/null -- clipped or invalid key
- //
- // Three kinds of nodes in a raster's list:
- //
- // - the head node
- // - an internal node
- // - the final node
- //
-
-#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
- skc_uint const cmd_idx = get_group_id(0);
-#else
- skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id();
-#endif
-
- // load command
- union skc_cmd_place const cmd = cmds[cmd_idx];
-
- // get the raster header from the raster host id -- scalar
- skc_block_id_t id = map[cmd.raster_h];
-
- //
- // load all of the head block ttxk keys into registers
- //
- // FIXME -- this pattern lends itself to using the higher
- // performance Intel GEN block load instructions
- //
- skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- union skc_raster_node_elem const h##I = { \
- .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)], \
- bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)] } \
- };
-
- SKC_PLACE_EXPAND();
-
- //
- // load raster header counts -- we only need the "nodes" and "keys"
- // words but the keys we loaded are doublewords.
- //
- // FIXME -- this can be made portable with compile-time macro expansion
- //
- skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
- skc_uint keys = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
-
- //
- //
- //
-#if 0
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \
- nodes,keys, \
- I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \
- h##I.u32v2.hi,h##I.u32v2.lo, \
- h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);
-
- SKC_PLACE_EXPAND();
-#endif
-
- //
-#if 0
- if (get_sub_group_local_id() == 0) {
- printf("place: %u / %u / %u\n",head_id,nodes,keys);
- }
-#endif
-
- {
- //
- // classify every key in the header
- //
- // keys: 0 is not a key / 1 is a key
- // skpk: 0 is sk / 1 is pk
- //
- skc_uint bits_keys = 0;
- skc_uint bits_skpk = 0;
-
- //
- // calculate bits_keys
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
- skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \
- if (idx < keys) { \
- bits_keys |= (1u << I); \
- } \
- if (SKC_PLACE_IS_TRAILING_ROW(I)) { \
- if (keys > SKC_RASTER_HEAD_COUNT_KEYS) { \
- if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \
- bits_keys &= ~(1u << I); \
- } \
- } \
- } \
- }
-
- SKC_PLACE_EXPAND();
-
- //
- // blindly calculate bits_skpk
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
- bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
- }
-
- SKC_PLACE_EXPAND();
-
-#if 0
- printf("%2X : %2X\n",bits_keys,bits_skpk);
-#endif
-
- //
- // next pointer is last element of last row. save it now because
- // this might be recognized as a subgroup-uniform/scalar.
- //
- id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);
-
- //
- // append SK keys first
- //
- skc_uint const bits_sk = bits_keys & ~bits_skpk;
- skc_uint sk = 0;
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
- skc_uint is_sk = (bits_sk >> I) & 1; \
- skc_uint sk_idx = skc_ballot(&sk,is_sk); \
- if (is_sk) { \
- smem->lo.sk[sk_idx] = h##I.xk.lo; \
- smem->hi.sk[sk_idx] = h##I.xk.hi; \
- } \
- }
-
- SKC_PLACE_EXPAND();
-
- //
- // append PK keys next
- //
- skc_uint const bits_pk = bits_keys & bits_skpk;
- skc_uint pk = 0;
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) { \
- skc_uint is_pk = (bits_pk >> I) & 1; \
- skc_uint pk_idx = skc_ballot(&pk,is_pk); \
- if (is_pk) { \
- smem->lo.pk[pk_idx] = h##I.xk.lo; \
- smem->hi.pk[pk_idx] = h##I.xk.hi; \
- } \
- }
-
- SKC_PLACE_EXPAND();
-
-#if 0
- printf("%2u * %2u\n",sk,pk);
-#endif
- //
- // flush the keys
- //
- skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
- skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
- }
-
- //
- // we're done if there was only a head node
- //
- if (nodes == 0)
- return;
-
- //
- // decrement keys
- //
- keys -= SKC_RASTER_HEAD_COUNT_KEYS;
-
- //
- // otherwise, append keys in trailing nodes to smem
- //
- while (true)
- {
- //
- // load all of the node block ttxk keys into registers
- //
- // FIXME -- this pattern lends itself to using the higher
- // performance Intel GEN block load instructions
- //
- skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- union skc_raster_node_elem const n##I = { \
- .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)], \
- bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)] } \
- };
-
- SKC_PLACE_EXPAND();
-
-#if 0
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- printf("%5u : %6u : %3u : %08X . %08X - %08X\n", \
- nodes,keys, \
- I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(), \
- n##I.u32v2.hi,n##I.u32v2.lo, \
- n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);
-
- SKC_PLACE_EXPAND();
-#endif
-
- //
- // classify every key in the node
- //
- // keys: 0 is not a key / 1 is a key
- // skpk: 0 is sk / 1 is pk
- //
- skc_uint bits_keys = 0;
- skc_uint bits_skpk = 0;
-
- //
- // calculate bits_keys
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) { \
- skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \
- if (idx < keys) { \
- bits_keys |= (1u << I); \
- } \
- if (SKC_PLACE_IS_TRAILING_ROW(I)) { \
- if (keys > SKC_RASTER_NODE_COUNT_KEYS) { \
- if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \
- bits_keys &= ~(1u << I); \
- } \
- } \
- } \
- }
-
- SKC_PLACE_EXPAND();
-
- //
- // blindly calculate bits_skpk
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) { \
- bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
- }
-
- SKC_PLACE_EXPAND();
-
-#if 0
- printf("%2X : %2X\n",bits_keys,bits_skpk);
-#endif
-
- //
- // next pointer is last element of last row. save it now because
- // this might be recognized as a subgroup-uniform/scalar.
- //
- id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);
-
- //
- // append SK keys first
- //
- skc_uint const bits_sk = bits_keys & ~bits_skpk;
- skc_uint sk = 0;
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) { \
- skc_uint is_sk = (bits_sk >> I) & 1; \
- skc_uint sk_idx = skc_ballot(&sk,is_sk); \
- if (is_sk) { \
- smem->lo.sk[sk_idx] = n##I.xk.lo; \
- smem->hi.sk[sk_idx] = n##I.xk.hi; \
- } \
- }
-
- SKC_PLACE_EXPAND();
-
- //
- // append PK keys next
- //
- skc_uint const bits_pk = bits_keys & bits_skpk;
- skc_uint pk = 0;
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) { \
- skc_uint is_pk = (bits_pk >> I) & 1; \
- skc_uint pk_idx = skc_ballot(&pk,is_pk); \
- if (is_pk) { \
- smem->lo.pk[pk_idx] = n##I.xk.lo; \
- smem->hi.pk[pk_idx] = n##I.xk.hi; \
- } \
- }
-
- SKC_PLACE_EXPAND();
-
-#if 0
- printf("%2u * %2u\n",sk,pk);
-#endif
- //
- // flush both queues for this node -- the flush is unconditional,
- // there is no highwater-mark test here
- //
- skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
- skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
-
- //
- // if this was the last node then we're done
- //
- if (--nodes == 0)
- return;
-
- //
- // otherwise decrement keys
- //
- keys -= SKC_RASTER_NODE_COUNT_KEYS;
- }
-}
-
-//
-//
-//
+/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "tile.h" +#include "common.h" +#include "raster.h" +#include "atomic_cl.h" +#include "kernel_cl_12.h" + +// +// +// + +#define SKC_PLACE_SUBGROUP_MASK (SKC_PLACE_SUBGROUP_SIZE - 1) +#define SKC_PLACE_SUBGROUP_LAST (SKC_PLACE_SUBGROUP_SIZE - 1) + +// +// +// + +#define SKC_PLACE_SMEM_COUNT_TTSK SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE) +#define SKC_PLACE_SMEM_COUNT_TTPK SKC_RASTER_NODE_MAX_TTPK + +// +// +// + +#define SKC_PLACE_X (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE) + +// +// +// + +#if ( SKC_PLACE_X == 1 ) +#define SKC_PLACE_EXPAND() SKC_EXPAND_1() +#define SKC_PLACE_EXPAND_I_LAST 0 + +#elif ( SKC_PLACE_X == 2 ) +#define SKC_PLACE_EXPAND() SKC_EXPAND_2() +#define SKC_PLACE_EXPAND_I_LAST 1 + +#elif ( SKC_PLACE_X == 4 ) +#define SKC_PLACE_EXPAND() SKC_EXPAND_4() +#define SKC_PLACE_EXPAND_I_LAST 3 + +#elif ( SKC_PLACE_X == 8 ) +#define SKC_PLACE_EXPAND() SKC_EXPAND_8() +#define SKC_PLACE_EXPAND_I_LAST 7 + +#elif ( SKC_PLACE_X == 16) +#define SKC_PLACE_EXPAND() SKC_EXPAND_16() +#define SKC_PLACE_EXPAND_I_LAST 15 +#endif + +// +// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE +// COALESCED WRITES. LO FIRST, FOLLOWED BY HI. +// +// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE +// KERNELS USE DIFFERENT SUBGROUP SIZES. +// +// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE +// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID. +// +// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER +// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). VULKAN MAY +// ONLY SUPPORT A SUBGROUP SIZE OF 16. 
+// + +#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE ) + +#define SKC_PLACE_STRIDE_H(L) (L) +#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE) +#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE) + +#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1 + +#define SKC_PLACE_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE) +#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_RATIO - 1) +#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I) ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK)) + +#define SKC_PLACE_STRIDE_H(L) (L) +#define SKC_PLACE_STRIDE_V_LO(I) (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE) +#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE) + +#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1 + +#define SKC_PLACE_SUBGROUP_RATIO (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE) +#define SKC_PLACE_SUBGROUP_RATIO_MASK (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask + +#define SKC_PLACE_STRIDE_H(L) (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK)) +#define SKC_PLACE_STRIDE_V_LO(I) (I * 2 * SKC_PLACE_SUBGROUP_SIZE) +#define SKC_PLACE_STRIDE_V_HI(I) (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO) + +#endif + +// +// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE +// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8) +// + +#define SKC_PLACE_IS_ALL_HEADER_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS) + +#define SKC_PLACE_IS_NOT_HEADER_ROW(i) ( (i) * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS) + +#define SKC_PLACE_IS_TRAILING_ROW(i) (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS) + +#define 
SKC_PLACE_IS_HEADER_ROW_KEY(i) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))


//
// Note: HEADER_LESS_THAN purposefully wraps unsigned integer to ~UINT_MAX
//
// Both macros compare the dword index of this lane's element in row
// (i) against (k). The HEADER variant first subtracts the header size
// so lanes that precede the first key wrap around and compare false.
//
#define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
#define SKC_PLACE_NODE_LESS_THAN(i,k)   ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() < (k))

//
// TTSK v2:
//
//  0                                       63
//  | TTSB ID | PREFIX |  SPAN   |  X  |  Y  |
//  +---------+--------+---------+-----+-----+
//  |    27   | 1 (=0) | 12 (=0) | 12  | 12  |
//
//
// TTPK v2:
//
//  0                                    63
//  | TTPB ID | PREFIX | SPAN |  X  |  Y  |
//  +---------+--------+------+-----+-----+
//  |    27   | 1 (=1) |  12  | 12  | 12  |
//
//

//
// TTCK (32-BIT COMPARE) v1:
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          30          |    1   |    1   |   18  |  7  |  7  |
//
//
// TTCK (32-BIT COMPARE) v2:
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          30          |    1   |    1   |   15  |  9  |  8  |
//
//
// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          27          |    1   |    1   |   18  |  9  |  8  |
//

//
// Per-subgroup local memory.
//
// This is a union: the scan scratchpad shares its storage with the
// sk/pk key queues. The queues hold the lo and hi 32-bit halves of
// each key in separate arrays.
//
// NOTE(review): because `scratch` is the first union member it
// presumably overlays the start of `lo.sk` -- confirm against the
// element types before relying on sk keys surviving a scratch write.
//
union skc_subgroup_smem
{
  skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // will only use SKC_PLACE_SUBGROUP_SIZE

  struct {
    struct {
      skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
      skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
    } lo;

    struct {
      skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
      skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
    } hi;

    // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK];
  };

};

//
// scatter scan max
//
// Given each lane's inclusive (iss) and exclusive (ess) prefix sum,
// returns for every lane the id of the "source" lane it should read
// from:
//
//   1. every lane zeroes its own scratch slot
//   2. each lane with (iss > 0) and (ess < subgroup size) writes its
//      lane id into scratch[max(ess,0)]
//   3. an inclusive max scan propagates those lane ids to the right
//
// Writes smem->scratch as a side effect.
//
static
skc_int_v_t
skc_scatter_scan_max(__local union skc_subgroup_smem volatile * const smem,
                     skc_int_v_t                                const iss,
                     skc_int_v_t                                const ess)
{
  //
  // prefix sums determine which lanes we're going to work on next
  //
  skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE);
  skc_int_v_t  const scratch_idx      = max(ess,0);

  //
  // SIMT
  //

  //
  // zero the volatile smem scratchpad using vector syntax
  //
  smem->scratch[get_sub_group_local_id()] = ( 0 );

  //
  // store source lane at starting lane
  //
  if (is_scratch_store) {
    smem->scratch[scratch_idx] = get_sub_group_local_id();
  }

  //
  // propagate lanes to right using max scan
  //
  skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()];
  skc_int_v_t const source  = sub_group_scan_inclusive_max(scratch);

  return source;
}

//
// Key clipping placeholder.
//
// Not implemented: both arguments are ignored and the function always
// returns false. The notes below are the original design options.
//

static
skc_bool
skc_xk_clip(union skc_tile_clip const * const tile_clip,
            skc_ttxk_t                * const xk)
{
  //
  // clip the sk and pk keys
  //
  // if fully clipped then return false
  //
  // alternatively -- we can expand all these keys in place
  //
  // alternatively -- keep sk and pk keys segregated because sk
  // represents the vast majority of keys and are easier to process.
  // don't mess with the fastpath!
  //
  return false;
}

//
// Convert the queued TTSK key at smem index `sk_idx` into a TTCK key:
//
//   - lo: the key's lo word OR'd with the layer id shifted above the
//         id/prefix/escape bits
//   - hi: the remaining layer id bits OR'd with the key's tile x and y
//         after translation by (cmd->tx, cmd->ty)
//

static
skc_ttck_t
skc_sk_to_ck(__local union skc_subgroup_smem volatile * const smem,
             union skc_cmd_place const                * const cmd,
             skc_uint                                   const sk_idx)
{
  skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0
  skc_uint const hi = smem->hi.sk[sk_idx];

  skc_ttck_t ck;

  ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id

  // FIXME -- x and y should already be clipped and shifted
  skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
  skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;

  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;

  return ck;
}

//
// Convert the queued TTPK key at smem index `pk_idx` into one of the
// TTCK keys it expands to.
//
// Same construction as skc_sk_to_ck() except that the lo word is
// masked with SKC_TTXK_LO_MASK_ID_PREFIX (dropping the span bits held
// in the lo word) and `dx` is added to the tile x coordinate.
//
static
skc_ttck_t
skc_pk_to_ck(__local union skc_subgroup_smem volatile * const smem,
             union skc_cmd_place const                * const cmd,
             skc_uint                                   const pk_idx,
             skc_uint                                   const dx)
{
  skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1
  skc_uint const hi = smem->hi.pk[pk_idx];

  skc_ttck_t ck;

  ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id

  // FIXME -- x and y should already be clipped and shifted
  skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
  skc_uint const y = (cmd->ty +      SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;

  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;

  return ck;
}

//
// Flush the first `sk` entries of the smem sk queue to the ck extent.
//
// Lane 0 reserves `sk` slots with one atomic add on place_atomics, the
// base is broadcast, and the lanes then convert and store the keys in
// subgroup-sized strides.
//

static
void
skc_ttsk_flush(__global SKC_ATOMIC_UINT volatile        * const place_atomics,
               __global skc_ttck_t                      * const ck_extent,
               __local union skc_subgroup_smem volatile * const smem,
               union skc_cmd_place const                * const cmd,
               skc_uint                                   const sk)
{
  //
  // The original note here read: "Pretty sure you can never ever have
  // an sk count equal to 0".
  //
  // If it does happen there is nothing to allocate or store, so bail
  // out the same way skc_ttpk_flush() does. The caller in this file
  // passes a total accumulated from subgroup broadcasts, so all lanes
  // take this branch together.
  //
  if (sk == 0)
    return;

  skc_uint ck_base = 0;

  // lane 0 allocates space in the ck extent with an atomic increment
  if (get_sub_group_local_id() == 0) {
    ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk);
  }

  // broadcast base to all lanes
  ck_base = sub_group_broadcast(ck_base,0);

  // convert sk keys to ck keys
  for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE)
    {
      ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii);
    }
}

//
// Return the span of the queued TTPK key at smem index `idx`.
//
// The span field straddles the two 32-bit halves: its low bits are the
// top of the lo word and its high bits are the bottom of the hi word.
// The returned value is the stored field plus one.
//

static
skc_int
skc_ttpk_get_span(__local union skc_subgroup_smem volatile * const smem,
                  skc_uint                                   const idx)
{
  skc_uint const lo      = smem->lo.pk[idx];
  skc_uint const hi      = smem->hi.pk[idx];

  skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN;
  skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN;

  return (span_lo | span_hi) + 1;
}

//
// Expand the first `pk` entries of the smem pk queue into ck keys and
// store them to the ck extent.
//
// The queue is consumed one subgroup of keys at a time. For each
// batch:
//
//   - every lane loads the span of its key (0 if past the end)
//   - a prefix sum over the spans gives each key's output offset and
//     the batch total
//   - lane 0 reserves the total with one atomic add on place_atomics
//   - the inner loop emits one subgroup of ck keys per pass, using
//     skc_scatter_scan_max() to find which pk key each output lane
//     belongs to and `dx` for its offset within that key's span
//
// NOTE(review): skc_scatter_scan_max() writes smem->scratch, which is
// a union member alongside the key queues. The kernel in this file
// flushes the sk queue before calling this function.
//

static
void
skc_ttpk_flush(__global SKC_ATOMIC_UINT volatile        * const place_atomics,
               __global skc_ttck_t                      * const ck_extent,
               __local union skc_subgroup_smem volatile * const smem,
               union skc_cmd_place const                * const cmd,
               skc_uint                                   const pk)
{
  // bail out if pk queue is empty
  if (pk == 0)
    return;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("%u\n",pk);
#endif

  //
  // FIXME -- this nested loop iterates over the queue processing a
  // subgroup of 64-bit keys at a time. This is probably not the most
  // efficient approach so investigate how to store and iterate over a
  // wider than subgroup (node-sized) queue of keys.
  //

  // round up so we work with full subgroups
  skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK;
  skc_uint       ii    = 0;

  // nested loop that expands all ttpk keys
  // (the outer loop is only compiled in when the queue can hold more
  // than one subgroup of keys)
#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE)
  for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE)
#endif
    {
      skc_uint idx  = ii + get_sub_group_local_id();
      skc_int  span = 0;

      // how many tiles does this ttpk span?
      if (idx < pk)
        span = skc_ttpk_get_span(smem,idx);

      // we need inclusive, exclusive and total
      skc_int iss = sub_group_scan_inclusive_add(span);
      skc_int ess = iss - span;
      skc_int rem = sub_group_broadcast(iss,SKC_PLACE_SUBGROUP_SIZE-1);

      // printf("%u : %u\n",span,iss);
      // continue;

      // atomically allocate space for the pk keys
      skc_uint ck_base = 0;

      // lane 0 allocates space in the ck extent with an atomic increment
      if (get_sub_group_local_id() == 0) {
        ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,rem);
      }

      // broadcast atomically allocated extent base to all lanes
      skc_uint ck_idx = sub_group_broadcast(ck_base,0) + get_sub_group_local_id();

      //
      // FIXME -- this loop would probably be faster if the ttpk keys
      // were held in registers and accessed with shuffles instead of
      // SMEM loads
      //

      //
      // loop until there are no more expanded pk keys
      //
      while (true)
        {
          skc_int const source = skc_scatter_scan_max(smem,iss,ess);
          skc_int const dx     = get_sub_group_local_id() - intel_sub_group_shuffle(ess,source);

          // store valid ck keys to gmem
          if (get_sub_group_local_id() < rem) {
            ck_extent[ck_idx] = skc_pk_to_ck(smem,cmd,ii+source,dx);
          }

          // decrement remainder
          rem -= SKC_PLACE_SUBGROUP_SIZE;

          if (rem <= 0)
            break;

          // increment/decrement indices
          ck_idx += SKC_PLACE_SUBGROUP_SIZE;
          iss    -= SKC_PLACE_SUBGROUP_SIZE;
          ess    -= SKC_PLACE_SUBGROUP_SIZE;
        }
    }
}

//
// Subgroup stream-compaction helper.
//
// `is_xk` is this lane's 0/1 vote and `*xk` is the running count of
// keys already appended. Returns the queue index for this lane's key
// (running count plus the number of lower lanes that voted 1) and
// advances `*xk` by the number of lanes that voted 1.
//

static
skc_uint
skc_ballot(skc_uint * const xk, skc_uint const is_xk)
{
#if 0
  //
  // FIXME -- when available, this should use the idiom:
  //
  //   ballot() + lane_mask_less_than_or_equal + popcount()
  //
  // Supported by:
  //
  //   - Vulkan 1.1 / SPIR-V 1.3
  //   - CUDA
  //   - AVX2 (SSE*?)
  //
#else
  //
  // otherwise, emulate with an inclusive scan (yuk)
  //
  skc_uint const prefix = sub_group_scan_inclusive_add(is_xk);

  skc_uint const xk_idx = *xk + prefix - is_xk;

  *xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST);

#if 0
  printf("< %3u >\n",xk_idx);
#endif

  return xk_idx;
#endif
}

//
// PLACE KERNEL
//
// Each subgroup handles one place command: it looks up the raster's
// head block through `map`, walks the raster's node list and, for
// every node, splits the keys into an sk and a pk queue in local
// memory and flushes both queues to `ck_extent`.
//
// NOTE(review): `clip` and `count` are never read in this kernel, so
// `cmds[cmd_idx]` is not bounds-checked against `count` -- confirm the
// host never launches more subgroups than there are commands.
//
__kernel
SKC_PLACE_KERNEL_ATTRIBS
void
skc_kernel_place(__global skc_bp_elem_t                * const bp_elems,
                 __global SKC_ATOMIC_UINT     volatile * const place_atomics,
                 __global skc_ttck_t                   * const ck_extent,
                 __global union skc_cmd_place const    * const cmds,
                 __global skc_block_id_t               * const map,
                 skc_uint4                               const clip,
                 skc_uint                                const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
  __local union skc_subgroup_smem volatile                smem[1];
#else
  __local union skc_subgroup_smem volatile                smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS];
  __local union skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif

  //
  // This is a subgroup-centric kernel
  //
  // Which subgroup in the grid is this?
  //
  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
  // get_group_id(0) as a uniform but the alternative calculation used
  // when there are multiple subgroups per workgroup is not
  // cooperating and driving spillage elsewhere.
  //
  // Test the raster's translated bounds against the composition's
  // tile clip
  //
  // There are 3 cases:
  //
  //   - the raster is completely clipped -> return
  //   - the raster is partially clipped  -> all keys must be clipped
  //   - the raster is not clipped        -> no keys are tested
  //
  //
  // There are at least 4 implementations of place and we want to
  // special-case them as much as possible so that, at the least, the
  // fastpath remains fast.
  //
  //  - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP
  //
  //  - implement CLIPPED + NO TRANSLATION path
  //
  //  - implement NO CLIP +    TRANSLATION path
  //
  //  - implement CLIPPED +    TRANSLATION path
  //
  //
  // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin
  // 12:12:8 integer where:
  //
  //  12: ttsk
  //  12: ttpk
  //   8: /dev/null -- clipped or invalid key
  //
  // Three kinds of nodes in a raster's list:
  //
  //  - the head node
  //  - an internal node
  //  - the final node
  //

#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
  skc_uint const cmd_idx = get_group_id(0);
#else
  skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  // load command
  union skc_cmd_place const cmd = cmds[cmd_idx];

  // get the raster header from the raster host id -- scalar
  skc_block_id_t id = map[cmd.raster_h];

  //
  // load all of the head block ttxk keys into registers
  //
  // FIXME -- this pattern lends itself to using the higher
  // performance Intel GEN block load instructions
  //
  skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  union skc_raster_node_elem const h##I = {                             \
    .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)],            \
               bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)]  }          \
  };

  SKC_PLACE_EXPAND();

  //
  // load raster header counts -- we only need the "nodes" and "keys"
  // words but the keys we loaded are doublewords.
  //
  // FIXME -- this can be made portable with compile-time macro expansion
  //
  skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
  skc_uint keys  = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS

  //
  //
  //
#if 0
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  printf("%5u : %6u : %3u : %08X . %08X - %08X\n",                      \
         nodes,keys,                                                    \
         I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(),          \
         h##I.u32v2.hi,h##I.u32v2.lo,                                   \
         h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);

  SKC_PLACE_EXPAND();
#endif

  //
#if 0
  if (get_sub_group_local_id() == 0) {
    printf("place: %u / %u / %u\n",head_id,nodes,keys);
  }
#endif

  {
    //
    // classify every key in the header
    //
    // keys: 0 is not a key / 1 is a key
    // skpk: 0 is sk / 1 is pk
    //
    // bit I of each mask describes this lane's element in row I
    //
    skc_uint bits_keys = 0;
    skc_uint bits_skpk = 0;

    //
    // calculate bits_keys
    //
    // rows that are entirely header are skipped; when more keys
    // remain than the head holds, the last lane of the trailing row
    // is cleared because that slot is the next-node link read below
    //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                              \
      skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \
      if (idx < keys) {                                                 \
        bits_keys |= (1u << I);                                         \
      }                                                                 \
      if (SKC_PLACE_IS_TRAILING_ROW(I)) {                               \
        if (keys > SKC_RASTER_HEAD_COUNT_KEYS) {                        \
          if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) {    \
            bits_keys &= ~(1u << I);                                    \
          }                                                             \
        }                                                               \
      }                                                                 \
    }

    SKC_PLACE_EXPAND();

    //
    // blindly calculate bits_skpk
    //
    // (moves each element's prefix bit down to bit position I; slots
    // that are not keys are masked out later with bits_keys)
    //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                              \
      bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
    }

    SKC_PLACE_EXPAND();

#if 0
    printf("%2X : %2X\n",bits_keys,bits_skpk);
#endif

    //
    // next pointer is last element of last row.  save it now because
    // this might be recognized as a subgroup-uniform/scalar.
    //
    id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);

    //
    // append SK keys first
    //
    skc_uint const bits_sk = bits_keys & ~bits_skpk;
    skc_uint       sk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                         \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {              \
      skc_uint is_sk  = (bits_sk >> I) & 1;             \
      skc_uint sk_idx = skc_ballot(&sk,is_sk);          \
      if (is_sk) {                                      \
        smem->lo.sk[sk_idx] = h##I.xk.lo;               \
        smem->hi.sk[sk_idx] = h##I.xk.hi;               \
      }                                                 \
    }

    SKC_PLACE_EXPAND();

    //
    // append PK keys next
    //
    skc_uint const bits_pk = bits_keys & bits_skpk;
    skc_uint       pk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                         \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {              \
      skc_uint is_pk  = (bits_pk >> I) & 1;             \
      skc_uint pk_idx = skc_ballot(&pk,is_pk);          \
      if (is_pk) {                                      \
        smem->lo.pk[pk_idx] = h##I.xk.lo;               \
        smem->hi.pk[pk_idx] = h##I.xk.hi;               \
      }                                                 \
    }

    SKC_PLACE_EXPAND();

#if 0
    printf("%2u * %2u\n",sk,pk);
#endif
    //
    // flush the keys
    //
    skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
    skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
  }

  //
  // we're done if there was only a head node
  //
  if (nodes == 0)
    return;

  //
  // decrement keys
  //
  keys -= SKC_RASTER_HEAD_COUNT_KEYS;

  //
  // otherwise, append keys in trailing nodes to smem
  //
  while (true)
    {
      //
      // load all of the node block ttxk keys into registers
      //
      // FIXME -- this pattern lends itself to using the higher
      // performance Intel GEN block load instructions
      //
      skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      union skc_raster_node_elem const n##I = {                         \
        .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)],        \
                   bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)]  }      \
      };

      SKC_PLACE_EXPAND();

#if 0
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      printf("%5u : %6u : %3u : %08X . %08X - %08X\n",                  \
             nodes,keys,                                                \
             I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(),      \
             n##I.u32v2.hi,n##I.u32v2.lo,                               \
             n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);

      SKC_PLACE_EXPAND();
#endif

      //
      // classify every key in the node
      //
      // keys: 0 is not a key / 1 is a key
      // skpk: 0 is sk / 1 is pk
      //
      skc_uint bits_keys = 0;
      skc_uint bits_skpk = 0;

      //
      // calculate bits_keys
      //
      // unlike the head block there are no header rows to skip; the
      // last lane of the trailing row is again cleared when more keys
      // remain than the node holds
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                                       \
        skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \
        if (idx < keys) {                                               \
          bits_keys |= (1u << I);                                       \
        }                                                               \
        if (SKC_PLACE_IS_TRAILING_ROW(I)) {                             \
          if (keys > SKC_RASTER_NODE_COUNT_KEYS) {                      \
            if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) {  \
              bits_keys &= ~(1u << I);                                  \
            }                                                           \
          }                                                             \
        }                                                               \
      }

      SKC_PLACE_EXPAND();

      //
      // blindly calculate bits_skpk
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                                       \
        bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
      }

      SKC_PLACE_EXPAND();

#if 0
      printf("%2X : %2X\n",bits_keys,bits_skpk);
#endif

      //
      // next pointer is last element of last row.  save it now because
      // this might be recognized as a subgroup-uniform/scalar.
      //
      id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);

      //
      // append SK keys first
      //
      skc_uint const bits_sk = bits_keys & ~bits_skpk;
      skc_uint       sk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                       \
        skc_uint is_sk  = (bits_sk >> I) & 1;           \
        skc_uint sk_idx = skc_ballot(&sk,is_sk);        \
        if (is_sk) {                                    \
          smem->lo.sk[sk_idx] = n##I.xk.lo;             \
          smem->hi.sk[sk_idx] = n##I.xk.hi;             \
        }                                               \
      }

      SKC_PLACE_EXPAND();

      //
      // append PK keys next
      //
      skc_uint const bits_pk = bits_keys & bits_skpk;
      skc_uint       pk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                       \
        skc_uint is_pk  = (bits_pk >> I) & 1;           \
        skc_uint pk_idx = skc_ballot(&pk,is_pk);        \
        if (is_pk) {                                    \
          smem->lo.pk[pk_idx] = n##I.xk.lo;             \
          smem->hi.pk[pk_idx] = n##I.xk.hi;             \
        }                                               \
      }

      SKC_PLACE_EXPAND();

#if 0
      printf("%2u * %2u\n",sk,pk);
#endif
      //
      // flush both queues to the extent after every node
      //
      // (the original note described flushing only once a queue
      // reaches a highwater mark; the code below flushes
      // unconditionally)
      //
      skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
      skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);

      //
      // if this was the last node then we're done
      //
      if (--nodes == 0)
        return;

      //
      // otherwise decrement keys
      //
      keys -= SKC_RASTER_NODE_COUNT_KEYS;
    }
}

//
//
//