path: root/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl
Diffstat (limited to 'src/compute/skc/platforms/cl_12/kernels/paths_copy.cl')
-rw-r--r--  src/compute/skc/platforms/cl_12/kernels/paths_copy.cl  1086
1 file changed, 543 insertions, 543 deletions
diff --git a/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl b/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl
index 302ea14af2..63a1a43177 100644
--- a/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl
@@ -1,543 +1,543 @@
-/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "path.h"
-#include "block_pool_cl.h"
-#include "path_builder_cl_12.h"
-#include "device_cl_12.h"
-
-//
-//
-//
-
-#if 0
-
-//
-// SIMD AVX2
-//
-
-#define SKC_PATHS_COPY_WORDS_PER_ELEM 8
-#define SKC_PATHS_COPY_SUBGROUP_SIZE 1
-#define SKC_PATHS_COPY_KERNEL_ATTRIBUTES
-
-typedef skc_uint8 skc_paths_copy_elem;
-typedef skc_uint8 skc_pb_idx_v;
-
-#define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_8()
-
-#define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + I >= SKC_PATH_HEAD_WORDS)
-
-#endif
-
-//
-//
-//
-
-#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK (SKC_PATHS_COPY_SUBGROUP_SIZE - 1)
-#define SKC_PATHS_COPY_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
-#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
-#define SKC_PATHS_COPY_ELEMS_PER_THREAD (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE)
-
-// FIXME -- use SUBGROUP terminology everywhere
-#define SKC_PATHS_COPY_SUBGROUP_WORDS (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS)
-
-//
-//
-//
-
-#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER \
- (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS))
-
-#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER \
- (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS))
-
-// #define SKC_PATHS_COPY_HEAD_ELEMS ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS)
-
-//
-//
-//
-
-//
-// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL
-//
-
-#define SKC_CMD_PATHS_COPY_ONE_BITS (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2)
-
-#define SKC_CMD_PATHS_COPY_ONE_MASK SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS)
-
-#define SKC_CMD_PATHS_COPY_ONE (1u << SKC_CMD_PATHS_COPY_ONE_BITS)
-
-#define SKC_CMD_PATHS_COPY_GET_TAG(ti) SKC_TAGGED_BLOCK_ID_GET_TAG(ti)
-
-#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti) ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS)
-
-#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b) (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG))
-
-//
-//
-//
-
-skc_uint
-skc_sub_group_local_id()
-{
-#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
- return get_sub_group_local_id();
-#else
- return 0;
-#endif
-}
-
-//
-// convert an atomic read counter offset to a block id
-//
-
-skc_block_id_t
-skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids,
- skc_uint const bp_idx_mask,
- skc_uint const bp_reads,
- skc_uint const bp_off)
-{
- skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask;
-
- return bp_ids[bp_idx];
-}
-
-//
-//
-//
-
-void
-skc_copy_segs(__global skc_paths_copy_elem * const bp_elems, // to
- skc_uint const bp_elems_idx,
- __global skc_paths_copy_elem const * const pb_elems, // from
- skc_uint const pb_elems_idx)
-{
- for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
- {
- (bp_elems+bp_elems_idx)[ii] = (pb_elems+pb_elems_idx)[ii];
- }
-
-#if 0
- //
- // NOTE THIS IS PRINTING 8 ROWS
- //
- printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
- (skc_uint)get_global_id(0),pb_elems_idx,
- as_float((pb_elems+pb_elems_idx)[0*SKC_PATHS_COPY_SUBGROUP_SIZE]),
- as_float((pb_elems+pb_elems_idx)[1*SKC_PATHS_COPY_SUBGROUP_SIZE]),
- as_float((pb_elems+pb_elems_idx)[2*SKC_PATHS_COPY_SUBGROUP_SIZE]),
- as_float((pb_elems+pb_elems_idx)[3*SKC_PATHS_COPY_SUBGROUP_SIZE]));
- printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
- (skc_uint)get_global_id(0),pb_elems_idx,
- as_float((pb_elems+pb_elems_idx)[4*SKC_PATHS_COPY_SUBGROUP_SIZE]),
- as_float((pb_elems+pb_elems_idx)[5*SKC_PATHS_COPY_SUBGROUP_SIZE]),
- as_float((pb_elems+pb_elems_idx)[6*SKC_PATHS_COPY_SUBGROUP_SIZE]),
- as_float((pb_elems+pb_elems_idx)[7*SKC_PATHS_COPY_SUBGROUP_SIZE]));
-#endif
-}
-
-//
-//
-//
-
-void
-skc_copy_node(__global skc_paths_copy_elem * const bp_elems, // to
- skc_uint const bp_elems_idx,
- __global skc_block_id_t const * const bp_ids,
- skc_uint const bp_reads,
- skc_uint const bp_idx_mask,
- __global skc_paths_copy_elem const * const pb_elems, // from
- skc_uint const pb_elems_idx,
- skc_uint const pb_rolling)
-{
- //
-  // remap block id tags from the host-side rolling counter to a
-  // device-side block pool id
- //
- for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
- {
- // load block_id_tag words
- skc_paths_copy_elem elem = (pb_elems + pb_elems_idx)[ii];
-
- // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
- skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
-
- // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
-
- //
- // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
- // will _always_ be safe as long as we don't use the loaded
- // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead
- // of iterating over the vector components.
- //
-
- // only convert if original elem is not invalid
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \
- skc_block_id_t const b = bp_ids[bp_idx C]; \
- elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \
- }
-
- // printf("%2u: < %8X, %8X, %8X >\n",ii,bp_idx,b,elem C);
-
- SKC_PATHS_COPY_ELEM_EXPAND();
-
- // store the elem back
- (bp_elems+bp_elems_idx)[ii] = elem;
- }
-}
-
-//
-//
-//
-
-void
-skc_host_map_update(__global skc_uint * const host_map,
- skc_uint const block,
- skc_paths_copy_elem const elem)
-{
- //
- // write first elem to map -- FIXME -- this is a little nasty
-  // because it relies on the host handle always being the first
- // word in the path header.
- //
- // OTOH, this is not unreasonable. The alternative is to have a
- // separate kernel initializing the map.
- //
-#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
- if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE)
-#endif
- {
-#if SKC_PATHS_COPY_ELEM_WORDS == 1
- host_map[elem] = block;
-#if 0
- printf("[%u] = %u\n",elem,block);
-#endif
-#else
- host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block;
-#endif
- }
-}
-
-//
-//
-//
-
-void
-skc_copy_head(__global skc_uint * const host_map,
- skc_uint const block,
- __global skc_paths_copy_elem * const bp_elems, // to
- skc_uint const bp_elems_idx,
- __global skc_block_id_t const * const bp_ids,
- skc_uint const bp_reads,
- skc_uint const bp_idx_mask,
- __global skc_paths_copy_elem const * const pb_elems, // from
- skc_uint const pb_elems_idx,
- skc_uint const pb_rolling)
-{
- //
- // if there are more path header words than there are
- // threads-per-block then we can just copy the initial header words
- //
-#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 )
- for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
- {
- skc_paths_copy_elem const elem = (pb_elems+pb_elems_idx)[ii];
-
- (bp_elems+bp_elems_idx)[ii] = elem;
-
- if (ii == 0) {
- skc_host_map_update(host_map,block,elem);
- }
- }
-#endif
-
- //
- // this is similar to copy node but the first H words of the path
- // header are not modified and simply copied
- //
- for (skc_uint ii=SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii<SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
- {
- skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];
-
-#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER == 0 )
- if (ii == 0) {
- skc_host_map_update(host_map,block,elem);
- }
-#endif
- // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
- skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
-
- //
- // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
- // will _always_ be safe as long as we don't use the loaded
- // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead
- // of iterating over the vector components.
- //
-
- // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
-
- // FIXME -- MIX MIX MIX MIX / SELECT
-
- // only convert if original elem is not invalid
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (SKC_IS_NOT_PATH_HEAD(ii,I) && (elem C != SKC_TAGGED_BLOCK_ID_INVALID)) { \
- skc_block_id_t const b = bp_ids[bp_idx C]; \
- elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \
- }
-
- // printf("%2u: ( %8X, %8X, %8X )\n",ii,bp_idx,b,elem C);
-
- SKC_PATHS_COPY_ELEM_EXPAND();
-
- // store the elem back
- (bp_elems+bp_elems_idx)[ii] = elem;
- }
-
- //
- // the remaining words are treated like a node
- //
- for (skc_uint ii=SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
- {
- // load block_id_tag words
- skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];
-
- // calculate ahead of time
- skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
-
- //
- // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
- // will _always_ be safe as long as we don't use the loaded
- // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead
- // of iterating over the vector components.
- //
-
- // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
-
- // only convert if original elem is not invalid
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \
- skc_block_id_t const b = bp_ids[bp_idx C]; \
- elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \
- }
-
- // printf("%2u: [ %8X, %8X, %8X ]\n",ii,bp_idx,b,elem C);
-
- SKC_PATHS_COPY_ELEM_EXPAND();
-
- // store the elem
- (bp_elems+bp_elems_idx)[ii] = elem;
- }
-}
-
-//
-// FIXME -- pack some of these constant integer args in a vec or struct
-//
-
-__kernel
-SKC_PATHS_COPY_KERNEL_ATTRIBS
-void
-skc_kernel_paths_copy
-(__global skc_uint * const host_map,
-
- __global skc_block_id_t const * const bp_ids,
- __global skc_paths_copy_elem * const bp_elems,
- skc_uint const bp_idx_mask, // pow2 modulo mask for block pool ring
-
- __global skc_uint const * const bp_alloc, // block pool ring base
- skc_uint const bp_alloc_idx,// which subbuf
-
- __global union skc_tagged_block_id const * const pb_cmds,
- __global skc_paths_copy_elem const * const pb_elems,
-
- skc_uint const pb_size, // # of commands/blocks in buffer
- skc_uint const pb_rolling, // shifted rolling counter base
-
- skc_uint const pb_prev_from,
- skc_uint const pb_prev_span,
- skc_uint const pb_curr_from)
-{
- //
- // THERE ARE 3 TYPES OF PATH COPYING COMMANDS:
- //
- // - HEAD
- // - NODE
- // - SEGS
- //
- // THESE ARE SUBGROUP ORIENTED KERNELS
- //
- // A SUBGROUP CAN OPERATE ON [1,N] BLOCKS
- //
-
- //
- // It's likely that peak bandwidth is achievable with a single
- // workgroup.
- //
- // So let's keep the grids modestly sized and for simplicity and
- // portability, let's assume that a single workgroup can perform all
- // steps in the copy.
- //
-  // Launch as large a workgroup as possible.
- //
-  //  1. ATOMICALLY ALLOCATE BLOCKS FROM THE BLOCK POOL
-  //  2. CONVERT COMMANDS INTO BLOCK OFFSETS
-  //  3. FOR EACH COMMAND:
-  //   - HEAD: SAVE HEAD ID TO MAP. CONVERT AND COPY H INDICES.
- // - NODE: CONVERT AND COPY B INDICES
- // - SEGS: BULK COPY
- //
- // B : number of words in block -- always pow2
- // W : intelligently/arbitrarily chosen factor of B -- always pow2
- //
-
- //
- // There are several approaches to processing the commands:
- //
- // 1. B threads are responsible for one block. All threads broadcast
-  //    load a single command word. Workgroup size must be a factor of
- // B.
- //
- // 2. W threads process an entire block. W will typically be the
- // device's subgroup/warp/wave width. W threads broadcast load a
- // single command word.
- //
- // 3. W threads process W blocks. W threads load W command words and
- // process W blocks.
- //
- // Clearly (1) has low I/O intensity but will achieve high
-  // parallelism by activating as many threads as possible. The downside
- // of this kind of approach is that the kernel will occupy even a
- // large GPU with low intensity work and reduce opportunities for
- // concurrent kernel execution (of other kernels).
- //
- // See Vasily Volkov's CUDA presentation describing these tradeoffs.
- //
- // Note that there are many other approaches. For example, similar
-  // to (1) but each thread loads a pow2 vector of block data.
- //
-
- // load the copied atomic read "base" from gmem
- skc_uint const bp_reads = bp_alloc[bp_alloc_idx];
- // will always be less than 2^32
- skc_uint const gid = get_global_id(0);
- // every subgroup/simd that will work on the block loads the same command
- skc_uint const sg_idx = gid / SKC_PATHS_COPY_SUBGROUP_SIZE;
- // path builder data can be spread across two spans
- skc_uint pb_idx = sg_idx + ((sg_idx < pb_prev_span) ? pb_prev_from : pb_curr_from);
-
-  // no need to make this branchless
- if (pb_idx >= pb_size)
- pb_idx -= pb_size;
-
- // broadcast load the command
- union skc_tagged_block_id const pb_cmd = pb_cmds[pb_idx];
-
-  // what do we want to do with this block?
- skc_cmd_paths_copy_tag const tag = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32);
-
- // compute offset from rolling base to get index into block pool ring allocation
- skc_uint const bp_off = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling);
-
-  // convert the pb_cmd's offset counter to a block id
- skc_block_id_t const block = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off);
-
-#if 0
- if (get_sub_group_local_id() == 0) {
- printf("bp_off/reads = %u / %u\n",bp_off,bp_reads);
- printf("< %8u >\n",block);
- }
-#endif
-
- // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id()
- skc_uint const tid = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK;
-
- // calculate bp_elems (to) / pb_elems (from)
- skc_uint const bp_elems_idx = block * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid;
- skc_uint const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK + tid;
-
- if (tag == SKC_CMD_PATHS_COPY_TAG_SEGS)
- {
-#if 0
- if (tid == 0)
- printf("%3u, segs\n",bp_off);
-#endif
- skc_copy_segs(bp_elems,
- bp_elems_idx,
- pb_elems,
- pb_elems_idx);
- }
- else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE)
- {
-#if 0
- if (tid == 0)
- printf("%3u, NODE\n",bp_off);
-#endif
- skc_copy_node(bp_elems, // to
- bp_elems_idx,
- bp_ids,
- bp_reads,
- bp_idx_mask,
- pb_elems, // from
- pb_elems_idx,
- pb_rolling);
- }
- else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD)
- {
-#if 0
- if (tid == 0)
- printf("%3u, HEAD\n",bp_off);
-#endif
- skc_copy_head(host_map,
- block,
- bp_elems, // to
- bp_elems_idx,
- bp_ids,
- bp_reads,
- bp_idx_mask,
- pb_elems, // from
- pb_elems_idx,
- pb_rolling);
- }
-}
-
-//
-//
-//
-
-__kernel
-SKC_PATHS_ALLOC_KERNEL_ATTRIBS
-void
-skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics,
- __global skc_uint * const bp_alloc,
- skc_uint const bp_alloc_idx,
- skc_uint const pb_cmd_count)
-{
- //
- // allocate blocks in block pool
- //
- skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count);
-
- // store in slot
- bp_alloc[bp_alloc_idx] = reads;
-
-#if 0
- printf("pc: %8u + %u\n",reads,pb_cmd_count);
-#endif
-}
-
-//
-//
-//
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "path.h"
+#include "block_pool_cl.h"
+#include "path_builder_cl_12.h"
+#include "kernel_cl_12.h"
+
+//
+//
+//
+
+#if 0
+
+//
+// SIMD AVX2
+//
+
+#define SKC_PATHS_COPY_WORDS_PER_ELEM 8
+#define SKC_PATHS_COPY_SUBGROUP_SIZE 1
+#define SKC_PATHS_COPY_KERNEL_ATTRIBUTES
+
+typedef skc_uint8 skc_paths_copy_elem;
+typedef skc_uint8 skc_pb_idx_v;
+
+#define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_8()
+
+#define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + I >= SKC_PATH_HEAD_WORDS)
+
+#endif
+
+//
+//
+//
+
+#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK (SKC_PATHS_COPY_SUBGROUP_SIZE - 1)
+#define SKC_PATHS_COPY_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
+#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
+#define SKC_PATHS_COPY_ELEMS_PER_THREAD (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE)
+
+// FIXME -- use SUBGROUP terminology everywhere
+#define SKC_PATHS_COPY_SUBGROUP_WORDS (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS)
+
+//
+//
+//
+
+#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER \
+ (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS))
+
+#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER \
+ (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS))
+
+// #define SKC_PATHS_COPY_HEAD_ELEMS ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS)
+
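+//
+// Worked example of the two header macros above, using hypothetical
+// values (the real ones come from path.h and the device configuration):
+// with SKC_PATH_HEAD_WORDS = 40, SKC_PATHS_COPY_ELEM_WORDS = 1 and
+// SKC_PATHS_COPY_SUBGROUP_SIZE = 16, SKC_PATHS_COPY_SUBGROUP_WORDS is 16,
+// so ELEMS_BEFORE_HEADER = 16 * (40 / 16) = 32 and
+// ELEMS_INCLUDING_HEADER = 16 * ((40 + 15) / 16) = 48. The head copy
+// below then copies elems [0,32) verbatim, treats [32,48) as the mixed
+// header/block-id span, and handles [48,...) like a node.
+//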
+//
+//
+//
+
+//
+// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL
+//
+
+#define SKC_CMD_PATHS_COPY_ONE_BITS (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2)
+
+#define SKC_CMD_PATHS_COPY_ONE_MASK SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS)
+
+#define SKC_CMD_PATHS_COPY_ONE (1u << SKC_CMD_PATHS_COPY_ONE_BITS)
+
+#define SKC_CMD_PATHS_COPY_GET_TAG(ti) SKC_TAGGED_BLOCK_ID_GET_TAG(ti)
+
+#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti) ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS)
+
+#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b) (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG))
+
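+//
+// A compiled-out sketch (in the spirit of the #if 0 debug blocks elsewhere
+// in this file) of how the macros above fit together: decode a command
+// word's rolling offset, look it up in the block pool ring, and splice the
+// resulting block id back into the word. The helper name and argument list
+// are illustrative only, not part of the kernel interface.
+//
+#if 0
+
+skc_uint
+skc_cmd_paths_copy_remap_example(__global skc_block_id_t const * const bp_ids,
+                                 skc_uint                        const bp_reads,
+                                 skc_uint                        const bp_idx_mask,
+                                 skc_uint                        const pb_rolling,
+                                 skc_uint                        const ti)
+{
+  // the tag in the low bits (SKC_CMD_PATHS_COPY_GET_TAG) selects HEAD /
+  // NODE / SEGS handling; the rolling counter in the high bits is what
+  // gets remapped below
+  skc_uint const roll = SKC_CMD_PATHS_COPY_GET_ROLLING(ti - pb_rolling);
+
+  // rolling offset -> block pool ring index -> device block id
+  skc_block_id_t const b = bp_ids[(bp_reads + roll) & bp_idx_mask];
+
+  // keep the low tag/subblock bits, replace the rolling bits with the block id
+  return SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b);
+}
+
+#endif
+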
+//
+//
+//
+
+skc_uint
+skc_sub_group_local_id()
+{
+#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
+ return get_sub_group_local_id();
+#else
+ return 0;
+#endif
+}
+
+//
+// convert an atomic read counter offset to a block id
+//
+
+skc_block_id_t
+skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids,
+ skc_uint const bp_idx_mask,
+ skc_uint const bp_reads,
+ skc_uint const bp_off)
+{
+ skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask;
+
+ return bp_ids[bp_idx];
+}
+
+//
+//
+//
+
+void
+skc_copy_segs(__global skc_paths_copy_elem * const bp_elems, // to
+ skc_uint const bp_elems_idx,
+ __global skc_paths_copy_elem const * const pb_elems, // from
+ skc_uint const pb_elems_idx)
+{
+ for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
+ {
+ (bp_elems+bp_elems_idx)[ii] = (pb_elems+pb_elems_idx)[ii];
+ }
+
+#if 0
+ //
+ // NOTE THIS IS PRINTING 8 ROWS
+ //
+ printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
+ (skc_uint)get_global_id(0),pb_elems_idx,
+ as_float((pb_elems+pb_elems_idx)[0*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+ as_float((pb_elems+pb_elems_idx)[1*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+ as_float((pb_elems+pb_elems_idx)[2*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+ as_float((pb_elems+pb_elems_idx)[3*SKC_PATHS_COPY_SUBGROUP_SIZE]));
+ printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
+ (skc_uint)get_global_id(0),pb_elems_idx,
+ as_float((pb_elems+pb_elems_idx)[4*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+ as_float((pb_elems+pb_elems_idx)[5*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+ as_float((pb_elems+pb_elems_idx)[6*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+ as_float((pb_elems+pb_elems_idx)[7*SKC_PATHS_COPY_SUBGROUP_SIZE]));
+#endif
+}
+
+//
+//
+//
+
+void
+skc_copy_node(__global skc_paths_copy_elem * const bp_elems, // to
+ skc_uint const bp_elems_idx,
+ __global skc_block_id_t const * const bp_ids,
+ skc_uint const bp_reads,
+ skc_uint const bp_idx_mask,
+ __global skc_paths_copy_elem const * const pb_elems, // from
+ skc_uint const pb_elems_idx,
+ skc_uint const pb_rolling)
+{
+ //
+  // remap block id tags from the host-side rolling counter to a
+  // device-side block pool id
+ //
+ for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
+ {
+ // load block_id_tag words
+ skc_paths_copy_elem elem = (pb_elems + pb_elems_idx)[ii];
+
+ // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
+ skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
+
+ // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
+
+ //
+ // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
+ // will _always_ be safe as long as we don't use the loaded
+ // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead
+ // of iterating over the vector components.
+ //
+
+ // only convert if original elem is not invalid
+
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \
+ skc_block_id_t const b = bp_ids[bp_idx C]; \
+ elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \
+ }
+
+ // printf("%2u: < %8X, %8X, %8X >\n",ii,bp_idx,b,elem C);
+
+ SKC_PATHS_COPY_ELEM_EXPAND();
+
+ // store the elem back
+ (bp_elems+bp_elems_idx)[ii] = elem;
+ }
+}
+
+//
+//
+//
+
+void
+skc_host_map_update(__global skc_uint * const host_map,
+ skc_uint const block,
+ skc_paths_copy_elem const elem)
+{
+ //
+ // write first elem to map -- FIXME -- this is a little nasty
+  // because it relies on the host handle always being the first
+ // word in the path header.
+ //
+ // OTOH, this is not unreasonable. The alternative is to have a
+ // separate kernel initializing the map.
+ //
+#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
+ if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE)
+#endif
+ {
+#if SKC_PATHS_COPY_ELEM_WORDS == 1
+ host_map[elem] = block;
+#if 0
+ printf("[%u] = %u\n",elem,block);
+#endif
+#else
+ host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block;
+#endif
+ }
+}
+
+//
+//
+//
+
+void
+skc_copy_head(__global skc_uint * const host_map,
+ skc_uint const block,
+ __global skc_paths_copy_elem * const bp_elems, // to
+ skc_uint const bp_elems_idx,
+ __global skc_block_id_t const * const bp_ids,
+ skc_uint const bp_reads,
+ skc_uint const bp_idx_mask,
+ __global skc_paths_copy_elem const * const pb_elems, // from
+ skc_uint const pb_elems_idx,
+ skc_uint const pb_rolling)
+{
+ //
+ // if there are more path header words than there are
+ // threads-per-block then we can just copy the initial header words
+ //
+#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 )
+ for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
+ {
+ skc_paths_copy_elem const elem = (pb_elems+pb_elems_idx)[ii];
+
+ (bp_elems+bp_elems_idx)[ii] = elem;
+
+ if (ii == 0) {
+ skc_host_map_update(host_map,block,elem);
+ }
+ }
+#endif
+
+ //
+ // this is similar to copy node but the first H words of the path
+ // header are not modified and simply copied
+ //
+ for (skc_uint ii=SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii<SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
+ {
+ skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];
+
+#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER == 0 )
+ if (ii == 0) {
+ skc_host_map_update(host_map,block,elem);
+ }
+#endif
+ // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
+ skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
+
+ //
+ // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
+ // will _always_ be safe as long as we don't use the loaded
+ // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead
+ // of iterating over the vector components.
+ //
+
+ // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
+
+ // FIXME -- MIX MIX MIX MIX / SELECT
+
+ // only convert if original elem is not invalid
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (SKC_IS_NOT_PATH_HEAD(ii,I) && (elem C != SKC_TAGGED_BLOCK_ID_INVALID)) { \
+ skc_block_id_t const b = bp_ids[bp_idx C]; \
+ elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \
+ }
+
+ // printf("%2u: ( %8X, %8X, %8X )\n",ii,bp_idx,b,elem C);
+
+ SKC_PATHS_COPY_ELEM_EXPAND();
+
+ // store the elem back
+ (bp_elems+bp_elems_idx)[ii] = elem;
+ }
+
+ //
+ // the remaining words are treated like a node
+ //
+ for (skc_uint ii=SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
+ {
+ // load block_id_tag words
+ skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];
+
+ // calculate ahead of time
+ skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
+
+ //
+ // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
+ // will _always_ be safe as long as we don't use the loaded
+ // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead
+ // of iterating over the vector components.
+ //
+
+ // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
+
+ // only convert if original elem is not invalid
+#undef SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) \
+ if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \
+ skc_block_id_t const b = bp_ids[bp_idx C]; \
+ elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \
+ }
+
+ // printf("%2u: [ %8X, %8X, %8X ]\n",ii,bp_idx,b,elem C);
+
+ SKC_PATHS_COPY_ELEM_EXPAND();
+
+ // store the elem
+ (bp_elems+bp_elems_idx)[ii] = elem;
+ }
+}
+
+//
+// FIXME -- pack some of these constant integer args in a vec or struct
+//
+
+__kernel
+SKC_PATHS_COPY_KERNEL_ATTRIBS
+void
+skc_kernel_paths_copy
+(__global skc_uint * const host_map,
+
+ __global skc_block_id_t const * const bp_ids,
+ __global skc_paths_copy_elem * const bp_elems,
+ skc_uint const bp_idx_mask, // pow2 modulo mask for block pool ring
+
+ __global skc_uint const * const bp_alloc, // block pool ring base
+ skc_uint const bp_alloc_idx,// which subbuf
+
+ __global union skc_tagged_block_id const * const pb_cmds,
+ __global skc_paths_copy_elem const * const pb_elems,
+
+ skc_uint const pb_size, // # of commands/blocks in buffer
+ skc_uint const pb_rolling, // shifted rolling counter base
+
+ skc_uint const pb_prev_from,
+ skc_uint const pb_prev_span,
+ skc_uint const pb_curr_from)
+{
+ //
+ // THERE ARE 3 TYPES OF PATH COPYING COMMANDS:
+ //
+ // - HEAD
+ // - NODE
+ // - SEGS
+ //
+ // THESE ARE SUBGROUP ORIENTED KERNELS
+ //
+ // A SUBGROUP CAN OPERATE ON [1,N] BLOCKS
+ //
+
+ //
+ // It's likely that peak bandwidth is achievable with a single
+ // workgroup.
+ //
+ // So let's keep the grids modestly sized and for simplicity and
+ // portability, let's assume that a single workgroup can perform all
+ // steps in the copy.
+ //
+  // Launch as large a workgroup as possible.
+ //
+  //  1. ATOMICALLY ALLOCATE BLOCKS FROM THE BLOCK POOL
+  //  2. CONVERT COMMANDS INTO BLOCK OFFSETS
+  //  3. FOR EACH COMMAND:
+  //   - HEAD: SAVE HEAD ID TO MAP. CONVERT AND COPY H INDICES.
+ // - NODE: CONVERT AND COPY B INDICES
+ // - SEGS: BULK COPY
+ //
+ // B : number of words in block -- always pow2
+ // W : intelligently/arbitrarily chosen factor of B -- always pow2
+ //
+
+ //
+ // There are several approaches to processing the commands:
+ //
+ // 1. B threads are responsible for one block. All threads broadcast
+  //    load a single command word. Workgroup size must be a factor of
+ // B.
+ //
+ // 2. W threads process an entire block. W will typically be the
+ // device's subgroup/warp/wave width. W threads broadcast load a
+ // single command word.
+ //
+ // 3. W threads process W blocks. W threads load W command words and
+ // process W blocks.
+ //
+ // Clearly (1) has low I/O intensity but will achieve high
+  // parallelism by activating as many threads as possible. The downside
+ // of this kind of approach is that the kernel will occupy even a
+ // large GPU with low intensity work and reduce opportunities for
+ // concurrent kernel execution (of other kernels).
+ //
+ // See Vasily Volkov's CUDA presentation describing these tradeoffs.
+ //
+ // Note that there are many other approaches. For example, similar
+  // to (1) but each thread loads a pow2 vector of block data.
+ //
+
+ // load the copied atomic read "base" from gmem
+ skc_uint const bp_reads = bp_alloc[bp_alloc_idx];
+ // will always be less than 2^32
+ skc_uint const gid = get_global_id(0);
+ // every subgroup/simd that will work on the block loads the same command
+ skc_uint const sg_idx = gid / SKC_PATHS_COPY_SUBGROUP_SIZE;
+ // path builder data can be spread across two spans
+ skc_uint pb_idx = sg_idx + ((sg_idx < pb_prev_span) ? pb_prev_from : pb_curr_from);
+
+  // no need to make this branchless
+ if (pb_idx >= pb_size)
+ pb_idx -= pb_size;
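+
+  // e.g. (hypothetical numbers): with pb_size = 256, pb_prev_from = 250
+  // and pb_prev_span = 8, subgroups 0..7 map to ring slots 250..257 and
+  // the subtraction above wraps 256 and 257 back to 0 and 1 -- a single
+  // subtraction suffices because a span never exceeds pb_size. This
+  // assumes the host passes pb_curr_from already biased by pb_prev_span
+  // for the remaining subgroups.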
+
+ // broadcast load the command
+ union skc_tagged_block_id const pb_cmd = pb_cmds[pb_idx];
+
+  // what do we want to do with this block?
+ skc_cmd_paths_copy_tag const tag = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32);
+
+ // compute offset from rolling base to get index into block pool ring allocation
+ skc_uint const bp_off = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling);
+
+  // convert the pb_cmd's offset counter to a block id
+ skc_block_id_t const block = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off);
+
+#if 0
+ if (get_sub_group_local_id() == 0) {
+ printf("bp_off/reads = %u / %u\n",bp_off,bp_reads);
+ printf("< %8u >\n",block);
+ }
+#endif
+
+ // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id()
+ skc_uint const tid = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK;
+
+ // calculate bp_elems (to) / pb_elems (from)
+ skc_uint const bp_elems_idx = block * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid;
+ skc_uint const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK + tid;
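+
+  // e.g. (hypothetical sizes): with SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK = 16
+  // and SKC_PATHS_COPY_ELEMS_PER_BLOCK = 128, block id 40 starts at pool
+  // elem 640 and ring slot pb_idx = 3 starts at staging elem 384; each
+  // lane (tid) then strides by SKC_PATHS_COPY_SUBGROUP_SIZE through its
+  // share of the block inside the copy routines.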
+
+ if (tag == SKC_CMD_PATHS_COPY_TAG_SEGS)
+ {
+#if 0
+ if (tid == 0)
+ printf("%3u, segs\n",bp_off);
+#endif
+ skc_copy_segs(bp_elems,
+ bp_elems_idx,
+ pb_elems,
+ pb_elems_idx);
+ }
+ else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE)
+ {
+#if 0
+ if (tid == 0)
+ printf("%3u, NODE\n",bp_off);
+#endif
+ skc_copy_node(bp_elems, // to
+ bp_elems_idx,
+ bp_ids,
+ bp_reads,
+ bp_idx_mask,
+ pb_elems, // from
+ pb_elems_idx,
+ pb_rolling);
+ }
+ else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD)
+ {
+#if 0
+ if (tid == 0)
+ printf("%3u, HEAD\n",bp_off);
+#endif
+ skc_copy_head(host_map,
+ block,
+ bp_elems, // to
+ bp_elems_idx,
+ bp_ids,
+ bp_reads,
+ bp_idx_mask,
+ pb_elems, // from
+ pb_elems_idx,
+ pb_rolling);
+ }
+}
+
+//
+//
+//
+
+__kernel
+SKC_PATHS_ALLOC_KERNEL_ATTRIBS
+void
+skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics,
+ __global skc_uint * const bp_alloc,
+ skc_uint const bp_alloc_idx,
+ skc_uint const pb_cmd_count)
+{
+ //
+ // allocate blocks in block pool
+ //
+ skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count);
+
+ // store in slot
+ bp_alloc[bp_alloc_idx] = reads;
+
+#if 0
+ printf("pc: %8u + %u\n",reads,pb_cmd_count);
+#endif
+}
+
+//
+//
+//