/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

#include "tile.h"
#include "common.h"
#include "raster.h"
#include "atomic_cl.h"
#include "kernel_cl_12.h"

//
//
//

#define SKC_PLACE_SUBGROUP_MASK      (SKC_PLACE_SUBGROUP_SIZE - 1)
#define SKC_PLACE_SUBGROUP_LAST      (SKC_PLACE_SUBGROUP_SIZE - 1)

//
//
//

#define SKC_PLACE_SMEM_COUNT_TTSK    SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_SMEM_COUNT_TTPK    SKC_RASTER_NODE_MAX_TTPK

//
//
//

#define SKC_PLACE_X                  (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE)

//
//
//

#if   ( SKC_PLACE_X == 1 )
#define SKC_PLACE_EXPAND()           SKC_EXPAND_1()
#define SKC_PLACE_EXPAND_I_LAST      0

#elif ( SKC_PLACE_X == 2 )
#define SKC_PLACE_EXPAND()           SKC_EXPAND_2()
#define SKC_PLACE_EXPAND_I_LAST      1

#elif ( SKC_PLACE_X == 4 )
#define SKC_PLACE_EXPAND()           SKC_EXPAND_4()
#define SKC_PLACE_EXPAND_I_LAST      3

#elif ( SKC_PLACE_X == 8 )
#define SKC_PLACE_EXPAND()           SKC_EXPAND_8()
#define SKC_PLACE_EXPAND_I_LAST      7

#elif ( SKC_PLACE_X == 16)
#define SKC_PLACE_EXPAND()           SKC_EXPAND_16()
#define SKC_PLACE_EXPAND_I_LAST      15
#endif

//
// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE
// COALESCED WRITES.  LO FIRST, FOLLOWED BY HI.
//
// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE
// KERNELS USE DIFFERENT SUBGROUP SIZES.
//
// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE
// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID.
//
// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER
// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32).  VULKAN MAY
// ONLY SUPPORT A SUBGROUP SIZE OF 16.
//

#if   ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE )

#define SKC_PLACE_STRIDE_H(L)             (L)
#define SKC_PLACE_STRIDE_V_LO(I)          (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I)          (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE)

#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1

#define SKC_PLACE_SUBGROUP_RATIO          (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_SUBGROUP_RATIO_MASK     (SKC_PLACE_SUBGROUP_RATIO - 1)
#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I) ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK))

#define SKC_PLACE_STRIDE_H(L)             (L)
#define SKC_PLACE_STRIDE_V_LO(I)          (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I)          (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE)

#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1

#define SKC_PLACE_SUBGROUP_RATIO          (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
#define SKC_PLACE_SUBGROUP_RATIO_MASK     (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask

#define SKC_PLACE_STRIDE_H(L)             (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK))
#define SKC_PLACE_STRIDE_V_LO(I)          (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I)          (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO)

#endif
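//
// EXAMPLE (ILLUSTRATIVE VALUES ONLY): when both subgroup sizes equal
// 8, row I of a block is stored as 8 lo words followed by 8 hi words,
// so lane L loads its 64-bit key from:
//
//   lo : base + I * 16 + L
//   hi : base + I * 16 + 8 + L
//
// i.e. both the lo and hi loads are coalesced across the subgroup
//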
//
// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE
// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8)
//

#define SKC_PLACE_IS_ALL_HEADER_ROW(i)   (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS)

#define SKC_PLACE_IS_NOT_HEADER_ROW(i)   ( (i)    * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS)

#define SKC_PLACE_IS_TRAILING_ROW(i)     (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS)

#define SKC_PLACE_IS_HEADER_ROW_KEY(i)   ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < keys)

//
// Note: HEADER_LESS_THAN purposefully relies on unsigned wraparound
// (underflow wraps toward UINT_MAX) for lanes that fall inside the
// raster header
//

#define SKC_PLACE_HEADER_LESS_THAN(i,k)  ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
#define SKC_PLACE_NODE_LESS_THAN(i,k)    ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id()                          < (k))

//
// TTSK v2:
//
//  0                                        63
//  | TTSB ID | PREFIX | SPAN    |  X  |  Y  |
//  +---------+--------+---------+-----+-----+
//  |   27    | 1 (=0) | 12 (=0) | 12  | 12  |
//
//
// TTPK v2:
//
//  0                                     63
//  | TTPB ID | PREFIX | SPAN |  X  |  Y  |
//  +---------+--------+------+-----+-----+
//  |   27    | 1 (=1) |  12  | 12  | 12  |
//
//
// TTCK (32-BIT COMPARE) v1:
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          30          |    1   |    1   |  18   |  7  |  7  |
//
//
// TTCK (32-BIT COMPARE) v2:
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          30          |    1   |    1   |  15   |  9  |  8  |
//
//
// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile
// (9 bits of X cover 512 8-pixel-wide tiles = 4096 pixels and
//  8 bits of Y cover 256 16-pixel-tall tiles = 4096 pixels):
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          27          |    1   |    1   |  18   |  9  |  8  |
//

union skc_subgroup_smem
{
  skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // will only use SKC_PLACE_SUBGROUP_SIZE

  struct {
    struct {
      skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
      skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
    } lo;

    struct {
      skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
      skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
    } hi;

    // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK];
  };
};

//
// scatter scan max
//
static
skc_int_v_t
skc_scatter_scan_max(__local union skc_subgroup_smem volatile * const smem,
                     skc_int_v_t                                const iss,
                     skc_int_v_t                                const ess)
{
  //
  // prefix sums determine which lanes we're going to work on next
  //
  skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE);
  skc_int_v_t  const scratch_idx      = max(ess,0);

  //
  // SIMT
  //

  //
  // zero the volatile smem scratchpad using vector syntax
  //
  smem->scratch[get_sub_group_local_id()] = ( 0 );

  //
  // store source lane at starting lane
  //
  if (is_scratch_store) {
    smem->scratch[scratch_idx] = get_sub_group_local_id();
  }

  //
  // propagate lanes to right using max scan
  //
  skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()];
  skc_int_v_t const source  = sub_group_scan_inclusive_max(scratch);

  return source;
}
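//
// worked example (illustrative values only): with an 8-lane subgroup
// whose lanes want to emit { 2, 1, 3, 2, 0, 0, 0, 0 } keys, the
// exclusive starts are ess = { 0, 2, 3, 6, 8, 8, 8, 8 } and the
// inclusive totals are iss = { 2, 3, 6, 8, 8, 8, 8, 8 }.  lanes 0..3
// store their lane id at scratch[ess]:
//
//   scratch = { 0, 0, 1, 2, 0, 0, 3, 0 }
//
// and the inclusive max scan propagates each source lane to the right:
//
//   source  = { 0, 0, 1, 2, 2, 2, 3, 3 }
//
// i.e. output slot j is produced by lane source[j]
//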
//
//
//

static
skc_bool
skc_xk_clip(union skc_tile_clip const * const tile_clip,
            skc_ttxk_t                * const xk)
{
  //
  // clip the sk and pk keys
  //
  // if fully clipped then return false
  //
  // alternatively -- we can expand all these keys in place
  //
  // alternatively -- keep sk and pk keys segregated because sk
  // represents the vast majority of keys and are easier to process.
  // don't mess with the fastpath!
  //
  return false;
}

//
//
//

static
skc_ttck_t
skc_sk_to_ck(__local union skc_subgroup_smem volatile * const smem,
             union skc_cmd_place const                * const cmd,
             skc_uint                                   const sk_idx)
{
  skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0
  skc_uint const hi = smem->hi.sk[sk_idx];

  skc_ttck_t ck;

  ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id

  // FIXME -- x and y should already be clipped and shifted
  skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
  skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;

  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;

  return ck;
}

static
skc_ttck_t
skc_pk_to_ck(__local union skc_subgroup_smem volatile * const smem,
             union skc_cmd_place const                * const cmd,
             skc_uint                                   const pk_idx,
             skc_uint                                   const dx)
{
  skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1
  skc_uint const hi = smem->hi.pk[pk_idx];

  skc_ttck_t ck;

  ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id

  // FIXME -- x and y should already be clipped and shifted
  skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
  skc_uint const y = (cmd->ty      + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;

  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;

  return ck;
}

//
//
//

static
void
skc_ttsk_flush(__global SKC_ATOMIC_UINT        volatile * const place_atomics,
               __global skc_ttck_t                      * const ck_extent,
               __local union skc_subgroup_smem volatile * const smem,
               union skc_cmd_place const                * const cmd,
               skc_uint                                   const sk)
{
  //
  // Pretty sure you can never ever have an sk count equal to 0
  //
  skc_uint ck_base = 0;

  // first lane allocates space in the ck extent with an atomic add
  if (get_sub_group_local_id() == 0) {
    ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk);
  }

  // broadcast base to all lanes
  ck_base = sub_group_broadcast(ck_base,0);

  // convert sk keys to ck keys
  for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE) {
    ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii);
  }
}
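//
// note the asymmetry between the two flushes: an sk key expands to
// exactly one ck key (the 1:1 loop above), whereas a pk key covers a
// span of consecutive tile columns and expands to one ck key per
// column with dx added to its X coordinate -- see skc_pk_to_ck()
// above and the expansion below
//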
//
//
//

static
skc_uint
skc_ttpk_get_span(__local union skc_subgroup_smem volatile * const smem,
                  skc_uint                                   const idx)
{
  skc_uint const lo      = smem->lo.pk[idx];
  skc_uint const hi      = smem->hi.pk[idx];

  skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN;
  skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN;

  return (span_lo | span_hi) + 1;
}

//
//
//

static
void
skc_ttpk_flush(__global SKC_ATOMIC_UINT        volatile * const place_atomics,
               __global skc_ttck_t                      * const ck_extent,
               __local union skc_subgroup_smem volatile * const smem,
               union skc_cmd_place const                * const cmd,
               skc_uint                                   const pk)
{
  // bail out if pk queue is empty
  if (pk == 0)
    return;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("%u\n",pk);
#endif

  //
  // FIXME -- this nested loop iterates over the queue processing a
  // subgroup of 64-bit keys at a time.  This is probably not the most
  // efficient approach so investigate how to store and iterate over a
  // wider than subgroup (node-sized) queue of keys.
  //

  // round up so we work with full subgroups
  skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK;
  skc_uint       ii    = 0;

  // nested loop that expands all ttpk keys
#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE)
  for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE)
#endif
    {
      skc_uint const idx = ii + get_sub_group_local_id();

      // lanes beyond the queue contribute a zero span
      skc_uint span = 0;

      if (idx < pk)
        span = skc_ttpk_get_span(smem,idx);

      // allocate ck keys for the whole subgroup with one atomic add
      skc_uint const iss = sub_group_scan_inclusive_add(span);
      skc_uint const ess = iss - span;

      skc_uint ck_base = 0;

      if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST)
        ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,iss);

      ck_base = sub_group_broadcast(ck_base,SKC_PLACE_SUBGROUP_LAST);

      //
      // NOTE -- this is the simplest per-lane expansion: each lane
      // emits one ck key per tile column in its span.  the
      // skc_scatter_scan_max() helper above is the building block for
      // a load-balanced variant.
      //
      for (skc_uint dx=0; dx<span; dx++)
        ck_extent[ck_base + ess + dx] = skc_pk_to_ck(smem,cmd,idx,dx);
    }
}

//
//
//

static
skc_uint
skc_ballot(skc_uint * const xk, skc_uint const is_xk)
{
  //
  // emulate a subgroup ballot with an inclusive add scan: is_xk is 0
  // or 1, the scan ranks the participating lanes, and the running
  // total in *xk is bumped by the subgroup's count
  //
  skc_uint const prefix = sub_group_scan_inclusive_add(is_xk);

  skc_uint const xk_idx = *xk + prefix - is_xk;

  *xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST);

#if 0
  printf("< %3u >\n",xk_idx);
#endif

  return xk_idx;
}

//
//
//

__kernel
SKC_PLACE_KERNEL_ATTRIBS
void
skc_kernel_place(__global skc_bp_elem_t             * const bp_elems,
                 __global SKC_ATOMIC_UINT  volatile * const place_atomics,
                 __global skc_ttck_t                * const ck_extent,
                 __global union skc_cmd_place const * const cmds,
                 __global skc_block_id_t            * const map,
                 skc_uint4                            const clip,
                 skc_uint                             const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
  __local union skc_subgroup_smem volatile         smem[1];
#else
  __local union skc_subgroup_smem volatile         smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS];
  __local union skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif

  //
  // This is a subgroup-centric kernel
  //
  // Which subgroup in the grid is this?
  //
  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
  // get_group_id(0) as a uniform but the alternative calculation used
  // when there are multiple subgroups per workgroup is not
  // cooperating and driving spillage elsewhere.
  //
  // Test the raster's translated bounds against the composition's
  // tile clip
  //
  // There are 3 cases:
  //
  //   - the raster is completely clipped -> return
  //   - the raster is partially  clipped -> all keys must be clipped
  //   - the raster is not clipped        -> no keys are tested
  //
  //
  // There are at least 4 implementations of place and we want to
  // special-case them as much as possible so that, at the least, the
  // fastpath remains fast.
  //
  //  - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP
  //
  //  - implement CLIPPED + NO TRANSLATION path
  //
  //  - implement NO CLIP + TRANSLATION path
  //
  //  - implement CLIPPED + TRANSLATION path
  //
  //
  // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin
  // 12:12:8 integer where:
  //
  //  12: ttsk
  //  12: ttpk
  //   8: /dev/null -- clipped or invalid key
  //
  // Three kinds of nodes in a raster's list:
  //
  //   - the head node
  //   - an internal node
  //   - the final node
  //

#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
  skc_uint const cmd_idx = get_group_id(0);
#else
  skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  // load command
  union skc_cmd_place const cmd = cmds[cmd_idx];

  // get the raster header from the raster host id -- scalar
  skc_block_id_t            id  = map[cmd.raster_h];

  //
  // load all of the head block ttxk keys into registers
  //
  // FIXME -- this pattern lends itself to using the higher
  // performance Intel GEN block load instructions
  //
  skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                       \
  union skc_raster_node_elem const h##I = {                           \
    .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)],          \
               bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)] }         \
  };

  SKC_PLACE_EXPAND();
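  //
  // after the expansion, each lane of the subgroup holds one 64-bit
  // ttxk key (a lo/hi dword pair) per register row h0..h<I_LAST>, so
  // the entire head block is resident in registers before any key is
  // classified
  //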
  //
  // load raster header counts -- we only need the "nodes" and "keys"
  // words but the keys we loaded are doublewords.
  //
  // FIXME -- this can be made portable with compile-time macro expansion
  //
  skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
  skc_uint keys  = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS

  //
  //
  //
#if 0
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                       \
  printf("%5u : %6u : %3u : %08X . %08X - %08X\n",                    \
         nodes,keys,                                                  \
         I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(),        \
         h##I.u32v2.hi,h##I.u32v2.lo,                                 \
         h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);

  SKC_PLACE_EXPAND();
#endif

  //
#if 0
  if (get_sub_group_local_id() == 0) {
    printf("place: %u / %u / %u\n",head_id,nodes,keys);
  }
#endif

  {
    //
    // classify every key in the header
    //
    // keys: 0 is not a key / 1 is a key
    // skpk: 0 is sk        / 1 is pk
    //
    skc_uint bits_keys = 0;
    skc_uint bits_skpk = 0;

    //
    // calculate bits_keys
    //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                       \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                            \
      skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \
      if (idx < keys) {                                               \
        bits_keys |= (1u << I);                                       \
      }                                                               \
      if (SKC_PLACE_IS_TRAILING_ROW(I)) {                             \
        if (keys > SKC_RASTER_HEAD_COUNT_KEYS) {                      \
          if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) {  \
            bits_keys &= ~(1u << I);                                  \
          }                                                           \
        }                                                             \
      }                                                               \
    }

    SKC_PLACE_EXPAND();

    //
    // blindly calculate bits_skpk
    //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                       \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                            \
      bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
    }

    SKC_PLACE_EXPAND();

#if 0
    printf("%2X : %2X\n",bits_keys,bits_skpk);
#endif

    //
    // next pointer is last element of last row.  save it now because
    // this might be recognized as a subgroup-uniform/scalar.
    //
    id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);

    //
    // append SK keys first
    //
    skc_uint const bits_sk = bits_keys & ~bits_skpk;
    skc_uint       sk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                       \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                            \
      skc_uint is_sk  = (bits_sk >> I) & 1;                           \
      skc_uint sk_idx = skc_ballot(&sk,is_sk);                        \
      if (is_sk) {                                                    \
        smem->lo.sk[sk_idx] = h##I.xk.lo;                             \
        smem->hi.sk[sk_idx] = h##I.xk.hi;                             \
      }                                                               \
    }

    SKC_PLACE_EXPAND();

    //
    // append PK keys next
    //
    skc_uint const bits_pk = bits_keys & bits_skpk;
    skc_uint       pk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                       \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                            \
      skc_uint is_pk  = (bits_pk >> I) & 1;                           \
      skc_uint pk_idx = skc_ballot(&pk,is_pk);                        \
      if (is_pk) {                                                    \
        smem->lo.pk[pk_idx] = h##I.xk.lo;                             \
        smem->hi.pk[pk_idx] = h##I.xk.hi;                             \
      }                                                               \
    }

    SKC_PLACE_EXPAND();

#if 0
    printf("%2u * %2u\n",sk,pk);
#endif

    //
    // flush the keys
    //
    skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
    skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
  }

  //
  // we're done if there was only a head node
  //
  if (nodes == 0)
    return;

  //
  // decrement keys
  //
  keys -= SKC_RASTER_HEAD_COUNT_KEYS;

  //
  // otherwise, append keys in trailing nodes to smem
  //
  while (true)
    {
      //
      // load all of the node block ttxk keys into registers
      //
      // FIXME -- this pattern lends itself to using the higher
      // performance Intel GEN block load instructions
      //
      skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                       \
      union skc_raster_node_elem const n##I = {                       \
        .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)],      \
                   bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)] }     \
      };

      SKC_PLACE_EXPAND();

#if 0
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                       \
      printf("%5u : %6u : %3u : %08X . %08X - %08X\n",                \
             nodes,keys,                                              \
             I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(),    \
             n##I.u32v2.hi,n##I.u32v2.lo,                             \
             n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);

      SKC_PLACE_EXPAND();
#endif
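      //
      // in a trailing node every element is a ttxk key except the
      // block's final element, which holds the id of the next node in
      // the raster's list -- that is why the classification below
      // drops the last lane of the trailing row and why the next id
      // is broadcast from SKC_PLACE_SUBGROUP_LAST
      //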
      //
      // classify every key in the node
      //
      // keys: 0 is not a key / 1 is a key
      // skpk: 0 is sk        / 1 is pk
      //
      skc_uint bits_keys = 0;
      skc_uint bits_skpk = 0;

      //
      // calculate bits_keys
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                                     \
        skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \
        if (idx < keys) {                                             \
          bits_keys |= (1u << I);                                     \
        }                                                             \
        if (SKC_PLACE_IS_TRAILING_ROW(I)) {                           \
          if (keys > SKC_RASTER_NODE_COUNT_KEYS) {                    \
            if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) { \
              bits_keys &= ~(1u << I);                                \
            }                                                         \
          }                                                           \
        }                                                             \
      }

      SKC_PLACE_EXPAND();

      //
      // blindly calculate bits_skpk
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                                     \
        bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
      }

      SKC_PLACE_EXPAND();

#if 0
      printf("%2X : %2X\n",bits_keys,bits_skpk);
#endif

      //
      // next pointer is last element of last row.  save it now because
      // this might be recognized as a subgroup-uniform/scalar.
      //
      id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);

      //
      // append SK keys first
      //
      skc_uint const bits_sk = bits_keys & ~bits_skpk;
      skc_uint       sk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                                     \
        skc_uint is_sk  = (bits_sk >> I) & 1;                         \
        skc_uint sk_idx = skc_ballot(&sk,is_sk);                      \
        if (is_sk) {                                                  \
          smem->lo.sk[sk_idx] = n##I.xk.lo;                           \
          smem->hi.sk[sk_idx] = n##I.xk.hi;                           \
        }                                                             \
      }

      SKC_PLACE_EXPAND();

      //
      // append PK keys next
      //
      skc_uint const bits_pk = bits_keys & bits_skpk;
      skc_uint       pk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                                     \
        skc_uint is_pk  = (bits_pk >> I) & 1;                         \
        skc_uint pk_idx = skc_ballot(&pk,is_pk);                      \
        if (is_pk) {                                                  \
          smem->lo.pk[pk_idx] = n##I.xk.lo;                           \
          smem->hi.pk[pk_idx] = n##I.xk.hi;                           \
        }                                                             \
      }

      SKC_PLACE_EXPAND();

#if 0
      printf("%2u * %2u\n",sk,pk);
#endif

      //
      // if the total for either the sk or pk queue reaches the
      // highwater mark then flush it to the extent
      //
      skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
      skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);

      //
      // if this was the last node then we're done
      //
      if (--nodes == 0)
        return;

      //
      // otherwise decrement keys
      //
      keys -= SKC_RASTER_NODE_COUNT_KEYS;
    }
}

//
//
//