aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl
diff options
context:
space:
mode:
Diffstat (limited to 'src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl')
-rw-r--r--src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl780
1 files changed, 390 insertions, 390 deletions
diff --git a/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl b/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl
index 2aee5dac17..5441dcdec7 100644
--- a/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl
@@ -1,390 +1,390 @@
-/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-// FIXME -- a pre-allocation step could load the path header quads and
-// total up the number of blocks in the workgroup or subgroup
-// minimizing the number of later atomics adds.
-//
-
-#include "block.h"
-#include "path.h"
-#include "common.h"
-#include "atomic_cl.h"
-#include "block_pool_cl.h"
-#include "device_cl_12.h"
-
-//
-//
-//
-
-#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
-
-#define SKC_PATHS_RECLAIM_SUBGROUP_ELEMS (SKC_PATHS_RECLAIM_SUBGROUP_SIZE * SKC_PATHS_RECLAIM_LOCAL_ELEMS)
-
-#define SKC_PATHS_RECLAIM_X (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_RECLAIM_SUBGROUP_ELEMS)
-
-//
-//
-//
-
-#if ( SKC_PATHS_RECLAIM_X == 1 )
-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1()
-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 0
-
-#elif ( SKC_PATHS_RECLAIM_X == 2 )
-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2()
-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 1
-
-#elif ( SKC_PATHS_RECLAIM_X == 4 )
-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4()
-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 3
-
-#elif ( SKC_PATHS_RECLAIM_X == 8 )
-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8()
-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 7
-
-#elif ( SKC_PATHS_RECLAIM_X == 16)
-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16()
-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 15
-
-#else
-#error "MISSING SKC_PATHS_RECLAIM_X"
-#endif
-
-//
-// FIXME -- slate these for replacement
-//
-
-#define SKC_BROADCAST(E,S,I) \
- sub_group_broadcast(E,S - I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
-
-#define SKC_BROADCAST_LAST_HELPER(E,I) \
- sub_group_broadcast(E,SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
-
-#define SKC_BROADCAST_LAST(E,I) \
- SKC_BROADCAST_LAST_HELPER(E,I)
-
-//
-// COMPILE-TIME PREDICATES
-//
-
-#define SKC_PATHS_RECLAIM_ELEM_GTE(X,I) \
- SKC_GTE_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
-
-#define SKC_PATHS_RECLAIM_ELEM_IN_RANGE(X,I) \
- (skc_bool)SKC_GTE_MACRO(X, I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) && \
- (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
-
-#define SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I) \
- SKC_PATHS_RECLAIM_ELEM_GTE(SKC_PATH_HEAD_WORDS,I)
-
-#define SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I) \
- SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_WORDS,I)
-
-//
-// RUN-TIME PREDICATES
-//
-
-#define SKC_PATHS_RECLAIM_IS_HEADER(I) \
- (get_sub_group_local_id() + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE < SKC_PATH_HEAD_WORDS)
-
-//
-// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
-// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
-// COMBOS (NOT NECESSARILY POW2)
-//
-// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
-// UINT TYPE INSTEAD OF A ULONG.
-//
-
-#define SKC_PATHS_RECLAIM_PACKED_COUNT_BITS SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2
-#define SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE skc_uint
-
-//
-//
-//
-
-#define SKC_PATHS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_PATHS_RECLAIM_PACKED_COUNT_BITS)
-
-#define SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \
- (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \
- ? 0 : (1u << SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I))
-
-#define SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \
- S = sub_group_scan_exclusive_add(C)
-
-#define SKC_PATHS_RECLAIM_PACKED_COUNT_GET(C,I) \
- (((C) >> (SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_PATHS_RECLAIM_PACKED_COUNT_MASK)
-
-//
-//
-//
-
-struct skc_reclaim
-{
- skc_path_h aN[SKC_RECLAIM_ARRAY_SIZE];
-};
-
-__kernel
-SKC_PATHS_RECLAIM_KERNEL_ATTRIBS
-void
-skc_kernel_paths_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring
- __global skc_uint * const bp_elems, // block pool blocks
- __global skc_uint volatile * const bp_atomics, // read/write atomics
- skc_uint const bp_mask, // pow2 modulo mask for block pool ring
- __global skc_block_id_t const * const map, // path host-to-device map
- struct skc_reclaim const reclaim) // array of host path ids
-{
-#if (__OPENCL_VERSION__ < 200)
- skc_uint const reclaim_stride = get_num_sub_groups();
-#else
- skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
-#endif
- skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id();
-
-#if 0
- //
- // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
- // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
- // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
- // RECLAMATION JOB ON THE REST OF THE PIPELINE.
- //
- for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
-#endif
- {
- // get host path id
- skc_path_h const path = reclaim.aN[reclaim_idx];
-
- // get the path header block from the map
- skc_block_id_t id = map[path];
-
- //
- // blindly load all of the head elements into registers
- //
- skc_uint const head_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- skc_uint h##I = bp_elems[head_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];
-
- SKC_PATHS_RECLAIM_BLOCK_EXPAND();
-
- //
- // pick out count.nodes and count.prims from the header
- //
- skc_uint count_blocks, count_nodes;
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \
- count_blocks = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \
- } \
- if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \
- count_nodes = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_NODES,I); \
- }
-
- SKC_PATHS_RECLAIM_BLOCK_EXPAND();
-
-#if 0
- if (get_sub_group_local_id() == 0) {
- printf("reclaim paths: %9u / %5u / %5u\n",path,count_blocks,count_nodes);
- }
-#endif
-
- //
- // acquire a span in the block pool ids ring for reclaimed ids
- //
- // FIXME count_blocks and atomic add can be done in same lane
- //
- skc_uint bp_ids_base = 0;
-
- if (get_sub_group_local_id() == 0) {
- bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
-
-#if 0
- printf("paths: bp_ids_base = %u\n",bp_ids_base);
-#endif
- }
-
- bp_ids_base = sub_group_broadcast(bp_ids_base,0);
-
- //
- // shift away the tagged block id's tag
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
- h##I = h##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; \
- }
-
- SKC_PATHS_RECLAIM_BLOCK_EXPAND();
-
- //
- // swap current id with next
- //
- if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
- {
- skc_block_id_t const next = SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);
-
- SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
-
- id = next;
- }
-
- //
- // - we'll skip subgroups that are entirely header
- //
- // - but we need to mark any header elements that partially fill
- // a subgroup as invalid tagged block ids
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
- if (SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)) { \
- if (SKC_PATHS_RECLAIM_IS_HEADER(I)) { \
- h##I = SKC_TAGGED_BLOCK_ID_INVALID; \
- } \
- } \
- }
-
- SKC_PATHS_RECLAIM_BLOCK_EXPAND();
-
- {
- //
- // count reclaimable blocks in each lane
- //
- SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
- packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
- }
-
- SKC_PATHS_RECLAIM_BLOCK_EXPAND();
-
- //
- // scan to find index of each block
- //
- SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
-
- SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
-
- //
- // store blocks back to ring
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
- skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
- skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
- skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
- if (count > 0) { \
- bp_ids[bp_ids_idx] = h##I; \
- } \
- skc_uint const total = index + count; \
- bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
- }
-
- SKC_PATHS_RECLAIM_BLOCK_EXPAND();
-
- // printf("P %7u ! %u\n",bp_ids_idx,h##I);
- }
-
- //
- // we're done if it was just the header
- //
- if (count_nodes == 0)
- return;
-
- //
- // otherwise, walk the nodes
- //
- do {
- // id of next block is in last lane
- id = sub_group_broadcast(id,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1);
-
- // get index of each element
- skc_uint const node_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
-
- //
- // blindly load all of the node elements into registers
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- skc_uint n##I = bp_elems[node_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];
-
- SKC_PATHS_RECLAIM_BLOCK_EXPAND();
-
- //
- // shift away the tagged block id's tag
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- n##I = n##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG;
-
- SKC_PATHS_RECLAIM_BLOCK_EXPAND();
-
- //
- // swap current id with next
- //
- if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
- {
- skc_block_id_t const next = SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);
-
- SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
-
- id = next;
- }
-
- //
- // count reclaimable blocks in each lane
- //
- SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
-
- SKC_PATHS_RECLAIM_BLOCK_EXPAND();
-
- //
- // scan to find index of each block
- //
- SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
-
- SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
-
- //
- // store blocks back to ring
- //
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) { \
- skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
- skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
- skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
- if (count > 0) { \
- bp_ids[bp_ids_idx] = n##I; \
- } \
- skc_uint const total = index + count; \
- bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
- }
-
- SKC_PATHS_RECLAIM_BLOCK_EXPAND();
-
- // printf("P %7u ! %u\n",bp_ids_idx,n##I);
-
- // any more nodes?
- } while (--count_nodes > 0);
- }
-}
-
-//
-//
-//
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+// FIXME -- a pre-allocation step could load the path header quads and
+// total up the number of blocks in the workgroup or subgroup
+// minimizing the number of later atomics adds.
+//
+
+#include "block.h"
+#include "path.h"
+#include "common.h"
+#include "atomic_cl.h"
+#include "block_pool_cl.h"
+#include "kernel_cl_12.h"
+
+//
+//
+//
+
+#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
+
+#define SKC_PATHS_RECLAIM_SUBGROUP_ELEMS (SKC_PATHS_RECLAIM_SUBGROUP_SIZE * SKC_PATHS_RECLAIM_LOCAL_ELEMS)
+
+#define SKC_PATHS_RECLAIM_X (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_RECLAIM_SUBGROUP_ELEMS)
+
+//
+//
+//
+
+#if ( SKC_PATHS_RECLAIM_X == 1 )
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1()
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 0
+
+#elif ( SKC_PATHS_RECLAIM_X == 2 )
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2()
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 1
+
+#elif ( SKC_PATHS_RECLAIM_X == 4 )
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4()
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 3
+
+#elif ( SKC_PATHS_RECLAIM_X == 8 )
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8()
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 7
+
+#elif ( SKC_PATHS_RECLAIM_X == 16)
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16()
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 15
+
+#else
+#error "MISSING SKC_PATHS_RECLAIM_X"
+#endif
+
+//
+// FIXME -- slate these for replacement
+//
+
+#define SKC_BROADCAST(E,S,I) \
+ sub_group_broadcast(E,S - I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
+
+#define SKC_BROADCAST_LAST_HELPER(E,I) \
+ sub_group_broadcast(E,SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
+
+#define SKC_BROADCAST_LAST(E,I) \
+ SKC_BROADCAST_LAST_HELPER(E,I)
+
+//
+// COMPILE-TIME PREDICATES
+//
+
+#define SKC_PATHS_RECLAIM_ELEM_GTE(X,I) \
+ SKC_GTE_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
+
+#define SKC_PATHS_RECLAIM_ELEM_IN_RANGE(X,I) \
+ (skc_bool)SKC_GTE_MACRO(X, I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) && \
+ (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
+
+#define SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I) \
+ SKC_PATHS_RECLAIM_ELEM_GTE(SKC_PATH_HEAD_WORDS,I)
+
+#define SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I) \
+ SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_WORDS,I)
+
+//
+// RUN-TIME PREDICATES
+//
+
+#define SKC_PATHS_RECLAIM_IS_HEADER(I) \
+ (get_sub_group_local_id() + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE < SKC_PATH_HEAD_WORDS)
+
+//
+// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
+// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
+// COMBOS (NOT NECESSARILY POW2)
+//
+// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
+// UINT TYPE INSTEAD OF A ULONG.
+//
+
+#define SKC_PATHS_RECLAIM_PACKED_COUNT_BITS SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2
+#define SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE skc_uint
+
+//
+//
+//
+
+#define SKC_PATHS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_PATHS_RECLAIM_PACKED_COUNT_BITS)
+
+#define SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \
+ (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \
+ ? 0 : (1u << SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I))
+
+#define SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \
+ S = sub_group_scan_exclusive_add(C)
+
+#define SKC_PATHS_RECLAIM_PACKED_COUNT_GET(C,I) \
+ (((C) >> (SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_PATHS_RECLAIM_PACKED_COUNT_MASK)
+
+//
+//
+//
+
+struct skc_reclaim
+{
+ skc_path_h aN[SKC_RECLAIM_ARRAY_SIZE];
+};
+
+__kernel
+SKC_PATHS_RECLAIM_KERNEL_ATTRIBS
+void
+skc_kernel_paths_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring
+ __global skc_uint * const bp_elems, // block pool blocks
+ __global skc_uint volatile * const bp_atomics, // read/write atomics
+ skc_uint const bp_mask, // pow2 modulo mask for block pool ring
+ __global skc_block_id_t const * const map, // path host-to-device map
+ struct skc_reclaim const reclaim) // array of host path ids
+{
+#if (__OPENCL_VERSION__ < 200)
+ skc_uint const reclaim_stride = get_num_sub_groups();
+#else
+ skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
+#endif
+ skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id();
+
+#if 0
+ //
+ // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
+ // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
+ // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
+ // RECLAMATION JOB ON THE REST OF THE PIPELINE.
+ //
+ for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
+#endif
+ {
+ // get host path id
+ skc_path_h const path = reclaim.aN[reclaim_idx];
+
+ // get the path header block from the map
+ skc_block_id_t id = map[path];
+
+ //
+ // blindly load all of the head elements into registers
+ //
+ skc_uint const head_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ skc_uint h##I = bp_elems[head_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];
+
+ SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+ //
+ // pick out count.nodes and count.prims from the header
+ //
+ skc_uint count_blocks, count_nodes;
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \
+ count_blocks = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \
+ } \
+ if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \
+ count_nodes = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_NODES,I); \
+ }
+
+ SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+#if 0
+ if (get_sub_group_local_id() == 0) {
+ printf("reclaim paths: %9u / %5u / %5u\n",path,count_blocks,count_nodes);
+ }
+#endif
+
+ //
+ // acquire a span in the block pool ids ring for reclaimed ids
+ //
+ // FIXME count_blocks and atomic add can be done in same lane
+ //
+ skc_uint bp_ids_base = 0;
+
+ if (get_sub_group_local_id() == 0) {
+ bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
+
+#if 0
+ printf("paths: bp_ids_base = %u\n",bp_ids_base);
+#endif
+ }
+
+ bp_ids_base = sub_group_broadcast(bp_ids_base,0);
+
+ //
+ // shift away the tagged block id's tag
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
+ h##I = h##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; \
+ }
+
+ SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+ //
+ // swap current id with next
+ //
+ if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
+ {
+ skc_block_id_t const next = SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);
+
+ SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
+
+ id = next;
+ }
+
+ //
+ // - we'll skip subgroups that are entirely header
+ //
+ // - but we need to mark any header elements that partially fill
+ // a subgroup as invalid tagged block ids
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
+ if (SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)) { \
+ if (SKC_PATHS_RECLAIM_IS_HEADER(I)) { \
+ h##I = SKC_TAGGED_BLOCK_ID_INVALID; \
+ } \
+ } \
+ }
+
+ SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+ {
+ //
+ // count reclaimable blocks in each lane
+ //
+ SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
+ packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
+ }
+
+ SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+ //
+ // scan to find index of each block
+ //
+ SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
+
+ SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
+
+ //
+ // store blocks back to ring
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
+ skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
+ skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
+ skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
+ if (count > 0) { \
+ bp_ids[bp_ids_idx] = h##I; \
+ } \
+ skc_uint const total = index + count; \
+ bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
+ }
+
+ SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+ // printf("P %7u ! %u\n",bp_ids_idx,h##I);
+ }
+
+ //
+ // we're done if it was just the header
+ //
+ if (count_nodes == 0)
+ return;
+
+ //
+ // otherwise, walk the nodes
+ //
+ do {
+ // id of next block is in last lane
+ id = sub_group_broadcast(id,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1);
+
+ // get index of each element
+ skc_uint const node_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
+
+ //
+ // blindly load all of the node elements into registers
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ skc_uint n##I = bp_elems[node_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];
+
+ SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+ //
+ // shift away the tagged block id's tag
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ n##I = n##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG;
+
+ SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+ //
+ // swap current id with next
+ //
+ if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
+ {
+ skc_block_id_t const next = SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);
+
+ SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
+
+ id = next;
+ }
+
+ //
+ // count reclaimable blocks in each lane
+ //
+ SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
+
+ SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+ //
+ // scan to find index of each block
+ //
+ SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
+
+ SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
+
+ //
+ // store blocks back to ring
+ //
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) { \
+ skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
+ skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
+ skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
+ if (count > 0) { \
+ bp_ids[bp_ids_idx] = n##I; \
+ } \
+ skc_uint const total = index + count; \
+ bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
+ }
+
+ SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+ // printf("P %7u ! %u\n",bp_ids_idx,n##I);
+
+ // any more nodes?
+ } while (--count_nodes > 0);
+ }
+}
+
+//
+//
+//