diff options
Diffstat (limited to 'src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl')
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl | 884 |
1 files changed, 442 insertions, 442 deletions
diff --git a/src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl b/src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl index 27411cfe96..b0eb7ea7ae 100644 --- a/src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl +++ b/src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl @@ -1,442 +1,442 @@ -/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "tile.h"
-#include "block.h"
-#include "raster.h"
-#include "common.h"
-#include "atomic_cl.h"
-#include "block_pool_cl.h"
-#include "device_cl_12.h"
-
-//
-//
-//
-
-#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
-
-#define SKC_RASTERS_RECLAIM_SUBGROUP_WORDS (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE * SKC_RASTERS_RECLAIM_LOCAL_ELEMS)
-
-#define SKC_RASTERS_RECLAIM_X (SKC_DEVICE_BLOCK_DWORDS / SKC_RASTERS_RECLAIM_SUBGROUP_WORDS)
-
-//
-//
-//
-
-#if ( SKC_RASTERS_RECLAIM_X == 1 )
-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1()
-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 0
-
-#elif ( SKC_RASTERS_RECLAIM_X == 2 )
-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2()
-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 1
-
-#elif ( SKC_RASTERS_RECLAIM_X == 4 )
-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4()
-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 3
-
-#elif ( SKC_RASTERS_RECLAIM_X == 8 )
-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8()
-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 7
-
-#elif ( SKC_RASTERS_RECLAIM_X == 16)
-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16()
-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 15
-
-#else
-#error "MISSING SKC_RASTERS_RECLAIM_X"
-#endif
-
-#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE )
-
-#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L)
-#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
-#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
-
-#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
-
-#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
-#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1)
-#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) ((I / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_RATIO + \
- (I & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
-
-#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L)
-#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
-#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_RATIO * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
-
-#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
-
-#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
-#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
-
-#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (((L) & ~SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
-#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
-#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO)
-
-#endif
-
-//
-// FIXME -- slate these for replacement
-//
-
-#define SKC_BROADCAST(E,S,I) \
- sub_group_broadcast(E,S - I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
-
-#define SKC_BROADCAST_LAST_HELPER(E,I) \
- sub_group_broadcast(E,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
-
-#define SKC_BROADCAST_LAST(E,I) \
- SKC_BROADCAST_LAST_HELPER(E,I)
-
-//
-// COMPILE-TIME PREDICATES
-//
-
-#define SKC_RASTERS_RECLAIM_ELEM_GTE(X,I) \
- SKC_GTE_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
-
-#define SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(X,I) \
- (skc_bool)SKC_GTE_MACRO(X, I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) && \
- (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
-
-#define SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I) \
- SKC_RASTERS_RECLAIM_ELEM_GTE(SKC_RASTER_HEAD_DWORDS,I)
-
-#define SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I) \
- SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(SKC_RASTER_HEAD_DWORDS,I)
-
-//
-// RUN-TIME PREDICATES
-//
-
-#define SKC_RASTERS_RECLAIM_IS_HEADER(I) \
- (get_sub_group_local_id() + I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE < SKC_RASTER_HEAD_DWORDS)
-
-//
-// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
-// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
-// COMBOS (NOT NECESSARILY POW2)
-//
-// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
-// UINT TYPE INSTEAD OF A ULONG.
-//
-
-#define SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2
-#define SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE skc_uint
-
-//
-//
-//
-
-#define SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS)
-
-#define SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \
- (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \
- ? 0 : (1u << SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I))
-
-#define SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \
- S = sub_group_scan_exclusive_add(C)
-
-#define SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(C,I) \
- (((C) >> (SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK)
-
-//
-//
-//
-
-struct skc_reclaim
-{
- skc_raster_h aN[SKC_RECLAIM_ARRAY_SIZE];
-};
-
-__kernel
-SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS
-void
-skc_kernel_rasters_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring
- __global skc_uint * const bp_elems, // block pool blocks
- __global skc_uint volatile * const bp_atomics, // read/write atomics
- skc_uint const bp_mask, // pow2 modulo mask for block pool ring
- __global skc_block_id_t const * const map, // raster host-to-device map
- struct skc_reclaim const reclaim) // array of host raster ids
-{
-#if (__OPENCL_VERSION__ < 200)
- skc_uint const reclaim_stride = get_num_sub_groups();
-#else
- skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
-#endif
- skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id();
-
-#if 0
- //
- // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
- // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
- // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
- // RECLAMATION JOB ON THE REST OF THE PIPELINE.
- //
- for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
-#endif
- {
- // get host raster id
- skc_raster_h const raster = reclaim.aN[reclaim_idx];
-
- // get block id of raster header
- skc_block_id_t id = map[raster];
-
- //
- // load all of the head block ttxk.lo keys into registers
- //
- // FIXME -- this pattern lends itself to using the higher
- // performance Intel GEN block load instructions
- //
- skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- skc_uint h##I = bp_elems[head_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-
- //
- // pick out count.nodes and count.prims from the header
- //
- // load raster header counts -- we only need the blocks and
- // nodes words the keys are doublewords.
- //
- // FIXME -- this can be made portable with compile-time macro expansion
- //
- skc_uint count_blocks = sub_group_broadcast(h0,0); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
- skc_uint count_nodes = sub_group_broadcast(h0,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
-
-#if 0
- if (get_sub_group_local_id() == 0) {
- printf("reclaim rasters: %u / %u / %5u / %5u\n",raster,id,count_blocks,count_nodes);
- }
-#endif
- //
- // acquire a span in the block pool ids ring for reclaimed ids
- //
- skc_uint bp_ids_base = 0;
-
- if (get_sub_group_local_id() == 0) {
- bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
- }
-
- bp_ids_base = sub_group_broadcast(bp_ids_base,0);
-
- //
- // mask off everything but the block id
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
- h##I = h##I & SKC_TTXK_LO_MASK_ID; \
- }
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-
- //
- // swap current id with next
- //
- if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
- {
- skc_block_id_t const next = SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
-
- SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
-
- id = next;
-#if 0
- printf("rasters next = %u\n",id);
-#endif
- }
-
-#if 0
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- printf("%08X %u\n",h##I,h##I);
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-#endif
-
-#if 0
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
- printf("%08X\n",h##I); \
- }
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-#endif
-
- //
- // - we'll skip subgroups that are entirely header
- //
- // - but we need to mark any header elements that partially fill
- // a subgroup as subblocks
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
- if (SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)) { \
- if (SKC_RASTERS_RECLAIM_IS_HEADER(I)) { \
- h##I = SKC_UINT_MAX; \
- } \
- } \
- }
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-
- {
- //
- // count reclaimable blocks in each lane
- //
- SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
- packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
- }
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-
- //
- // scan to find index of each block
- //
- SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
-
- SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
-
- //
- // store blocks back to ring
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
- skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
- skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
- skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
- if (count > 0) { \
- bp_ids[bp_ids_idx] = h##I; \
- } \
- skc_uint const total = index + count; \
- bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
- }
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
- }
-
- // printf("R %7u ! %u\n",bp_ids_idx,h##I);
-
- //
- // we're done if it was just the header
- //
- if (count_nodes == 0)
- return;
-
- //
- // otherwise, walk the nodes
- //
- do {
- // id of next block is in last lane
- id = sub_group_broadcast(id,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1);
-
- //
- // load all of the node block ttxk.lo keys into registers
- //
- // FIXME -- this pattern lends itself to using the higher
- // performance Intel GEN block load instructions
- //
- skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- skc_uint n##I = bp_elems[node_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-
- //
- // mask off everything but the block id
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- n##I = n##I & SKC_TTXK_LO_MASK_ID;
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-
- //
- // swap current id with next
- //
- if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
- {
- skc_block_id_t const next = SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
-
- SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
-
- id = next;
-#if 0
- printf("rasters next = %u\n",id);
-#endif
- }
-
-#if 0
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- printf("%08X %u\n",n##I,n##I);
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-#endif
-
- //
- // count reclaimable blocks in each lane
- //
- SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-
- //
- // scan to find index of each block
- //
- SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
-
- SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
-
- //
- // store blocks back to ring
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) { \
- skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
- skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
- skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
- if (count > 0) { \
- bp_ids[bp_ids_idx] = n##I; \
- } \
- skc_uint const total = index + count; \
- bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
- }
-
- SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
-
- // printf("R %7u ! %u\n",bp_ids_idx,n##I);
-
- // any more nodes?
- } while (--count_nodes > 0);
- }
-}
-
-//
-//
-//
+/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "tile.h" +#include "block.h" +#include "raster.h" +#include "common.h" +#include "atomic_cl.h" +#include "block_pool_cl.h" +#include "kernel_cl_12.h" + +// +// +// + +#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) + +#define SKC_RASTERS_RECLAIM_SUBGROUP_WORDS (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE * SKC_RASTERS_RECLAIM_LOCAL_ELEMS) + +#define SKC_RASTERS_RECLAIM_X (SKC_DEVICE_BLOCK_DWORDS / SKC_RASTERS_RECLAIM_SUBGROUP_WORDS) + +// +// +// + +#if ( SKC_RASTERS_RECLAIM_X == 1 ) +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1() +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 0 + +#elif ( SKC_RASTERS_RECLAIM_X == 2 ) +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2() +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 1 + +#elif ( SKC_RASTERS_RECLAIM_X == 4 ) +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4() +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 3 + +#elif ( SKC_RASTERS_RECLAIM_X == 8 ) +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8() +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 7 + +#elif ( SKC_RASTERS_RECLAIM_X == 16) +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16() +#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 15 + +#else +#error "MISSING SKC_RASTERS_RECLAIM_X" +#endif + +#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) + +#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L) +#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) +#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) + +#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1 + +#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) +#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) +#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) ((I / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_RATIO + \ + (I & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK)) + +#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L) +#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) +#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_RATIO * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) + +#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1 + +#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE) +#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask + +#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (((L) & ~SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK)) +#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) +#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) + +#endif + +// +// FIXME -- slate these for replacement +// + +#define SKC_BROADCAST(E,S,I) \ + sub_group_broadcast(E,S - I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) + +#define SKC_BROADCAST_LAST_HELPER(E,I) \ + sub_group_broadcast(E,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) + +#define SKC_BROADCAST_LAST(E,I) \ + SKC_BROADCAST_LAST_HELPER(E,I) + +// +// COMPILE-TIME PREDICATES +// + +#define SKC_RASTERS_RECLAIM_ELEM_GTE(X,I) \ + SKC_GTE_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) + +#define SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(X,I) \ + (skc_bool)SKC_GTE_MACRO(X, I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) && \ + (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) + +#define SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I) \ + SKC_RASTERS_RECLAIM_ELEM_GTE(SKC_RASTER_HEAD_DWORDS,I) + +#define SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I) \ + SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(SKC_RASTER_HEAD_DWORDS,I) + +// +// RUN-TIME PREDICATES +// + +#define SKC_RASTERS_RECLAIM_IS_HEADER(I) \ + (get_sub_group_local_id() + I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE < SKC_RASTER_HEAD_DWORDS) + +// +// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL +// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK +// COMBOS (NOT NECESSARILY POW2) +// +// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR +// UINT TYPE INSTEAD OF A ULONG. +// + +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2 +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE skc_uint + +// +// +// + +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS) + +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \ + (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \ + ? 0 : (1u << SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) + +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \ + S = sub_group_scan_exclusive_add(C) + +#define SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(C,I) \ + (((C) >> (SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK) + +// +// +// + +struct skc_reclaim +{ + skc_raster_h aN[SKC_RECLAIM_ARRAY_SIZE]; +}; + +__kernel +SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS +void +skc_kernel_rasters_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring + __global skc_uint * const bp_elems, // block pool blocks + __global skc_uint volatile * const bp_atomics, // read/write atomics + skc_uint const bp_mask, // pow2 modulo mask for block pool ring + __global skc_block_id_t const * const map, // raster host-to-device map + struct skc_reclaim const reclaim) // array of host raster ids +{ +#if (__OPENCL_VERSION__ < 200) + skc_uint const reclaim_stride = get_num_sub_groups(); +#else + skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups +#endif + skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id(); + +#if 0 + // + // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT + // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL + // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE + // RECLAMATION JOB ON THE REST OF THE PIPELINE. + // + for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride) +#endif + { + // get host raster id + skc_raster_h const raster = reclaim.aN[reclaim_idx]; + + // get block id of raster header + skc_block_id_t id = map[raster]; + + // + // load all of the head block ttxk.lo keys into registers + // + // FIXME -- this pattern lends itself to using the higher + // performance Intel GEN block load instructions + // + skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id()); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + skc_uint h##I = bp_elems[head_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)]; + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // pick out count.nodes and count.prims from the header + // + // load raster header counts -- we only need the blocks and + // nodes words the keys are doublewords. + // + // FIXME -- this can be made portable with compile-time macro expansion + // + skc_uint count_blocks = sub_group_broadcast(h0,0); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES + skc_uint count_nodes = sub_group_broadcast(h0,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS + +#if 0 + if (get_sub_group_local_id() == 0) { + printf("reclaim rasters: %u / %u / %5u / %5u\n",raster,id,count_blocks,count_nodes); + } +#endif + // + // acquire a span in the block pool ids ring for reclaimed ids + // + skc_uint bp_ids_base = 0; + + if (get_sub_group_local_id() == 0) { + bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks); + } + + bp_ids_base = sub_group_broadcast(bp_ids_base,0); + + // + // mask off everything but the block id + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ + h##I = h##I & SKC_TTXK_LO_MASK_ID; \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // swap current id with next + // + if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) + { + skc_block_id_t const next = SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST); + + SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; + + id = next; +#if 0 + printf("rasters next = %u\n",id); +#endif + } + +#if 0 +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + printf("%08X %u\n",h##I,h##I); + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); +#endif + +#if 0 +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ + printf("%08X\n",h##I); \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); +#endif + + // + // - we'll skip subgroups that are entirely header + // + // - but we need to mark any header elements that partially fill + // a subgroup as subblocks + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ + if (SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)) { \ + if (SKC_RASTERS_RECLAIM_IS_HEADER(I)) { \ + h##I = SKC_UINT_MAX; \ + } \ + } \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + { + // + // count reclaimable blocks in each lane + // + SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ + packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // scan to find index of each block + // + SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); + + SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); + + // + // store blocks back to ring + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \ + skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ + skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ + skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ + if (count > 0) { \ + bp_ids[bp_ids_idx] = h##I; \ + } \ + skc_uint const total = index + count; \ + bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + } + + // printf("R %7u ! %u\n",bp_ids_idx,h##I); + + // + // we're done if it was just the header + // + if (count_nodes == 0) + return; + + // + // otherwise, walk the nodes + // + do { + // id of next block is in last lane + id = sub_group_broadcast(id,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); + + // + // load all of the node block ttxk.lo keys into registers + // + // FIXME -- this pattern lends itself to using the higher + // performance Intel GEN block load instructions + // + skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id()); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + skc_uint n##I = bp_elems[node_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)]; + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // mask off everything but the block id + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + n##I = n##I & SKC_TTXK_LO_MASK_ID; + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // swap current id with next + // + if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1) + { + skc_block_id_t const next = SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST); + + SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; + + id = next; +#if 0 + printf("rasters next = %u\n",id); +#endif + } + +#if 0 +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + printf("%08X %u\n",n##I,n##I); + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); +#endif + + // + // count reclaimable blocks in each lane + // + SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I); + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // + // scan to find index of each block + // + SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); + + SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); + + // + // store blocks back to ring + // +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) { \ + skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ + skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ + skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ + if (count > 0) { \ + bp_ids[bp_ids_idx] = n##I; \ + } \ + skc_uint const total = index + count; \ + bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \ + } + + SKC_RASTERS_RECLAIM_BLOCK_EXPAND(); + + // printf("R %7u ! %u\n",bp_ids_idx,n##I); + + // any more nodes? + } while (--count_nodes > 0); + } +} + +// +// +// |