diff options
Diffstat (limited to 'src/compute/skc/platforms/cl_12/kernels/paths_copy.cl')
-rw-r--r-- | src/compute/skc/platforms/cl_12/kernels/paths_copy.cl | 1086 |
1 files changed, 543 insertions, 543 deletions
diff --git a/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl b/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl index 302ea14af2..63a1a43177 100644 --- a/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl +++ b/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl @@ -1,543 +1,543 @@ -/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "path.h"
-#include "block_pool_cl.h"
-#include "path_builder_cl_12.h"
-#include "device_cl_12.h"
-
-//
-//
-//
-
-#if 0
-
-//
-// SIMD AVX2
-//
-
-#define SKC_PATHS_COPY_WORDS_PER_ELEM 8
-#define SKC_PATHS_COPY_SUBGROUP_SIZE 1
-#define SKC_PATHS_COPY_KERNEL_ATTRIBUTES
-
-typedef skc_uint8 skc_paths_copy_elem;
-typedef skc_uint8 skc_pb_idx_v;
-
-#define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_8()
-
-#define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + I >= SKC_PATH_HEAD_WORDS)
-
-#endif
-
-//
-//
-//
-
-#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK (SKC_PATHS_COPY_SUBGROUP_SIZE - 1)
-#define SKC_PATHS_COPY_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
-#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
-#define SKC_PATHS_COPY_ELEMS_PER_THREAD (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE)
-
-// FIXME -- use SUBGROUP terminology everywhere
-#define SKC_PATHS_COPY_SUBGROUP_WORDS (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS)
-
-//
-//
-//
-
-#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER \
- (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS))
-
-#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER \
- (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS))
-
-// #define SKC_PATHS_COPY_HEAD_ELEMS ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS)
-
-//
-//
-//
-
-//
-// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL
-//
-
-#define SKC_CMD_PATHS_COPY_ONE_BITS (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2)
-
-#define SKC_CMD_PATHS_COPY_ONE_MASK SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS)
-
-#define SKC_CMD_PATHS_COPY_ONE (1u << SKC_CMD_PATHS_COPY_ONE_BITS)
-
-#define SKC_CMD_PATHS_COPY_GET_TAG(ti) SKC_TAGGED_BLOCK_ID_GET_TAG(ti)
-
-#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti) ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS)
-
-#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b) (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG))
-
-//
-//
-//
-
-skc_uint
-skc_sub_group_local_id()
-{
-#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
- return get_sub_group_local_id();
-#else
- return 0;
-#endif
-}
-
-//
-// convert an atomic read counter offset to a block id
-//
-
-skc_block_id_t
-skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids,
- skc_uint const bp_idx_mask,
- skc_uint const bp_reads,
- skc_uint const bp_off)
-{
- skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask;
-
- return bp_ids[bp_idx];
-}
-
-//
-//
-//
-
-void
-skc_copy_segs(__global skc_paths_copy_elem * const bp_elems, // to
- skc_uint const bp_elems_idx,
- __global skc_paths_copy_elem const * const pb_elems, // from
- skc_uint const pb_elems_idx)
-{
- for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
- {
- (bp_elems+bp_elems_idx)[ii] = (pb_elems+pb_elems_idx)[ii];
- }
-
-#if 0
- //
- // NOTE THIS IS PRINTING 8 ROWS
- //
- printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
- (skc_uint)get_global_id(0),pb_elems_idx,
- as_float((pb_elems+pb_elems_idx)[0*SKC_PATHS_COPY_SUBGROUP_SIZE]),
- as_float((pb_elems+pb_elems_idx)[1*SKC_PATHS_COPY_SUBGROUP_SIZE]),
- as_float((pb_elems+pb_elems_idx)[2*SKC_PATHS_COPY_SUBGROUP_SIZE]),
- as_float((pb_elems+pb_elems_idx)[3*SKC_PATHS_COPY_SUBGROUP_SIZE]));
- printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
- (skc_uint)get_global_id(0),pb_elems_idx,
- as_float((pb_elems+pb_elems_idx)[4*SKC_PATHS_COPY_SUBGROUP_SIZE]),
- as_float((pb_elems+pb_elems_idx)[5*SKC_PATHS_COPY_SUBGROUP_SIZE]),
- as_float((pb_elems+pb_elems_idx)[6*SKC_PATHS_COPY_SUBGROUP_SIZE]),
- as_float((pb_elems+pb_elems_idx)[7*SKC_PATHS_COPY_SUBGROUP_SIZE]));
-#endif
-}
-
-//
-//
-//
-
-void
-skc_copy_node(__global skc_paths_copy_elem * const bp_elems, // to
- skc_uint const bp_elems_idx,
- __global skc_block_id_t const * const bp_ids,
- skc_uint const bp_reads,
- skc_uint const bp_idx_mask,
- __global skc_paths_copy_elem const * const pb_elems, // from
- skc_uint const pb_elems_idx,
- skc_uint const pb_rolling)
-{
- //
- // remap block id tags bp_elems the host-side rolling counter pb_elems a
- // device-side block pool id
- //
- for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
- {
- // load block_id_tag words
- skc_paths_copy_elem elem = (pb_elems + pb_elems_idx)[ii];
-
- // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
- skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
-
- // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
-
- //
- // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
- // will _always_ be safe as long as we don't use the loaded
- // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead
- // of iterating over the vector components.
- //
-
- // only convert if original elem is not invalid
-
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \
- skc_block_id_t const b = bp_ids[bp_idx C]; \
- elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \
- }
-
- // printf("%2u: < %8X, %8X, %8X >\n",ii,bp_idx,b,elem C);
-
- SKC_PATHS_COPY_ELEM_EXPAND();
-
- // store the elem back
- (bp_elems+bp_elems_idx)[ii] = elem;
- }
-}
-
-//
-//
-//
-
-void
-skc_host_map_update(__global skc_uint * const host_map,
- skc_uint const block,
- skc_paths_copy_elem const elem)
-{
- //
- // write first elem to map -- FIXME -- this is a little nasty
- // because it relies on the the host handle always being the first
- // word in the path header.
- //
- // OTOH, this is not unreasonable. The alternative is to have a
- // separate kernel initializing the map.
- //
-#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
- if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE)
-#endif
- {
-#if SKC_PATHS_COPY_ELEM_WORDS == 1
- host_map[elem] = block;
-#if 0
- printf("[%u] = %u\n",elem,block);
-#endif
-#else
- host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block;
-#endif
- }
-}
-
-//
-//
-//
-
-void
-skc_copy_head(__global skc_uint * const host_map,
- skc_uint const block,
- __global skc_paths_copy_elem * const bp_elems, // to
- skc_uint const bp_elems_idx,
- __global skc_block_id_t const * const bp_ids,
- skc_uint const bp_reads,
- skc_uint const bp_idx_mask,
- __global skc_paths_copy_elem const * const pb_elems, // from
- skc_uint const pb_elems_idx,
- skc_uint const pb_rolling)
-{
- //
- // if there are more path header words than there are
- // threads-per-block then we can just copy the initial header words
- //
-#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 )
- for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
- {
- skc_paths_copy_elem const elem = (pb_elems+pb_elems_idx)[ii];
-
- (bp_elems+bp_elems_idx)[ii] = elem;
-
- if (ii == 0) {
- skc_host_map_update(host_map,block,elem);
- }
- }
-#endif
-
- //
- // this is similar to copy node but the first H words of the path
- // header are not modified and simply copied
- //
- for (skc_uint ii=SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii<SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
- {
- skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];
-
-#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER == 0 )
- if (ii == 0) {
- skc_host_map_update(host_map,block,elem);
- }
-#endif
- // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
- skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
-
- //
- // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
- // will _always_ be safe as long as we don't use the loaded
- // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead
- // of iterating over the vector components.
- //
-
- // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
-
- // FIXME -- MIX MIX MIX MIX / SELECT
-
- // only convert if original elem is not invalid
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (SKC_IS_NOT_PATH_HEAD(ii,I) && (elem C != SKC_TAGGED_BLOCK_ID_INVALID)) { \
- skc_block_id_t const b = bp_ids[bp_idx C]; \
- elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \
- }
-
- // printf("%2u: ( %8X, %8X, %8X )\n",ii,bp_idx,b,elem C);
-
- SKC_PATHS_COPY_ELEM_EXPAND();
-
- // store the elem back
- (bp_elems+bp_elems_idx)[ii] = elem;
- }
-
- //
- // the remaining words are treated like a node
- //
- for (skc_uint ii=SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
- {
- // load block_id_tag words
- skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];
-
- // calculate ahead of time
- skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
-
- //
- // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
- // will _always_ be safe as long as we don't use the loaded
- // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead
- // of iterating over the vector components.
- //
-
- // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
-
- // only convert if original elem is not invalid
-#undef SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R) \
- if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \
- skc_block_id_t const b = bp_ids[bp_idx C]; \
- elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \
- }
-
- // printf("%2u: [ %8X, %8X, %8X ]\n",ii,bp_idx,b,elem C);
-
- SKC_PATHS_COPY_ELEM_EXPAND();
-
- // store the elem
- (bp_elems+bp_elems_idx)[ii] = elem;
- }
-}
-
-//
-// FIXME -- pack some of these constant integer args in a vec or struct
-//
-
-__kernel
-SKC_PATHS_COPY_KERNEL_ATTRIBS
-void
-skc_kernel_paths_copy
-(__global skc_uint * const host_map,
-
- __global skc_block_id_t const * const bp_ids,
- __global skc_paths_copy_elem * const bp_elems,
- skc_uint const bp_idx_mask, // pow2 modulo mask for block pool ring
-
- __global skc_uint const * const bp_alloc, // block pool ring base
- skc_uint const bp_alloc_idx,// which subbuf
-
- __global union skc_tagged_block_id const * const pb_cmds,
- __global skc_paths_copy_elem const * const pb_elems,
-
- skc_uint const pb_size, // # of commands/blocks in buffer
- skc_uint const pb_rolling, // shifted rolling counter base
-
- skc_uint const pb_prev_from,
- skc_uint const pb_prev_span,
- skc_uint const pb_curr_from)
-{
- //
- // THERE ARE 3 TYPES OF PATH COPYING COMMANDS:
- //
- // - HEAD
- // - NODE
- // - SEGS
- //
- // THESE ARE SUBGROUP ORIENTED KERNELS
- //
- // A SUBGROUP CAN OPERATE ON [1,N] BLOCKS
- //
-
- //
- // It's likely that peak bandwidth is achievable with a single
- // workgroup.
- //
- // So let's keep the grids modestly sized and for simplicity and
- // portability, let's assume that a single workgroup can perform all
- // steps in the copy.
- //
- // Launch as large of a workgroup as possiblex
- //
- // 1. ATOMICALLY ALLOCATE BLOCKS BP_ELEMS POOL
- // 2. CONVERT COMMANDS IN PB_ELEMS BLOCK OFFSETS
- // 3. FOR EACH COMMAND:
- // - HEAD: SAVED HEAD ID PB_ELEMS MAP. CONVERT AND COPY H INDICES.
- // - NODE: CONVERT AND COPY B INDICES
- // - SEGS: BULK COPY
- //
- // B : number of words in block -- always pow2
- // W : intelligently/arbitrarily chosen factor of B -- always pow2
- //
-
- //
- // There are several approaches to processing the commands:
- //
- // 1. B threads are responsible for one block. All threads broadcast
- // load a single command word. Workgroup size must be a facpb_elemsr of
- // B.
- //
- // 2. W threads process an entire block. W will typically be the
- // device's subgroup/warp/wave width. W threads broadcast load a
- // single command word.
- //
- // 3. W threads process W blocks. W threads load W command words and
- // process W blocks.
- //
- // Clearly (1) has low I/O intensity but will achieve high
- // parallelism by activating the most possible threads. The downside
- // of this kind of approach is that the kernel will occupy even a
- // large GPU with low intensity work and reduce opportunities for
- // concurrent kernel execution (of other kernels).
- //
- // See Vasily Volkov's CUDA presentation describing these tradeoffs.
- //
- // Note that there are many other approaches. For example, similar
- // pb_elems (1) but each thread loads a pow2 vector of block data.
- //
-
- // load the copied atomic read "base" from gmem
- skc_uint const bp_reads = bp_alloc[bp_alloc_idx];
- // will always be less than 2^32
- skc_uint const gid = get_global_id(0);
- // every subgroup/simd that will work on the block loads the same command
- skc_uint const sg_idx = gid / SKC_PATHS_COPY_SUBGROUP_SIZE;
- // path builder data can be spread across two spans
- skc_uint pb_idx = sg_idx + ((sg_idx < pb_prev_span) ? pb_prev_from : pb_curr_from);
-
- // no need pb_elems make this branchless
- if (pb_idx >= pb_size)
- pb_idx -= pb_size;
-
- // broadcast load the command
- union skc_tagged_block_id const pb_cmd = pb_cmds[pb_idx];
-
- // what do we want pb_elems do with this block?
- skc_cmd_paths_copy_tag const tag = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32);
-
- // compute offset from rolling base to get index into block pool ring allocation
- skc_uint const bp_off = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling);
-
- // convert the pb_cmd's offset counter pb_elems a block id
- skc_block_id_t const block = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off);
-
-#if 0
- if (get_sub_group_local_id() == 0) {
- printf("bp_off/reads = %u / %u\n",bp_off,bp_reads);
- printf("< %8u >\n",block);
- }
-#endif
-
- // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id()
- skc_uint const tid = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK;
-
- // calculate bp_elems (to) / pb_elems (from)
- skc_uint const bp_elems_idx = block * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid;
- skc_uint const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK + tid;
-
- if (tag == SKC_CMD_PATHS_COPY_TAG_SEGS)
- {
-#if 0
- if (tid == 0)
- printf("%3u, segs\n",bp_off);
-#endif
- skc_copy_segs(bp_elems,
- bp_elems_idx,
- pb_elems,
- pb_elems_idx);
- }
- else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE)
- {
-#if 0
- if (tid == 0)
- printf("%3u, NODE\n",bp_off);
-#endif
- skc_copy_node(bp_elems, // to
- bp_elems_idx,
- bp_ids,
- bp_reads,
- bp_idx_mask,
- pb_elems, // from
- pb_elems_idx,
- pb_rolling);
- }
- else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD)
- {
-#if 0
- if (tid == 0)
- printf("%3u, HEAD\n",bp_off);
-#endif
- skc_copy_head(host_map,
- block,
- bp_elems, // to
- bp_elems_idx,
- bp_ids,
- bp_reads,
- bp_idx_mask,
- pb_elems, // from
- pb_elems_idx,
- pb_rolling);
- }
-}
-
-//
-//
-//
-
-__kernel
-SKC_PATHS_ALLOC_KERNEL_ATTRIBS
-void
-skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics,
- __global skc_uint * const bp_alloc,
- skc_uint const bp_alloc_idx,
- skc_uint const pb_cmd_count)
-{
- //
- // allocate blocks in block pool
- //
- skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count);
-
- // store in slot
- bp_alloc[bp_alloc_idx] = reads;
-
-#if 0
- printf("pc: %8u + %u\n",reads,pb_cmd_count);
-#endif
-}
-
-//
-//
-//
+/* + * Copyright 2017 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + */ + +// +// +// + +#include "path.h" +#include "block_pool_cl.h" +#include "path_builder_cl_12.h" +#include "kernel_cl_12.h" + +// +// +// + +#if 0 + +// +// SIMD AVX2 +// + +#define SKC_PATHS_COPY_WORDS_PER_ELEM 8 +#define SKC_PATHS_COPY_SUBGROUP_SIZE 1 +#define SKC_PATHS_COPY_KERNEL_ATTRIBUTES + +typedef skc_uint8 skc_paths_copy_elem; +typedef skc_uint8 skc_pb_idx_v; + +#define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_8() + +#define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + I >= SKC_PATH_HEAD_WORDS) + +#endif + +// +// +// + +#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK (SKC_PATHS_COPY_SUBGROUP_SIZE - 1) +#define SKC_PATHS_COPY_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS) +#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS) +#define SKC_PATHS_COPY_ELEMS_PER_THREAD (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE) + +// FIXME -- use SUBGROUP terminology everywhere +#define SKC_PATHS_COPY_SUBGROUP_WORDS (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS) + +// +// +// + +#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER \ + (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS)) + +#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER \ + (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS)) + +// #define SKC_PATHS_COPY_HEAD_ELEMS ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS) + +// +// +// + +// +// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL +// + +#define SKC_CMD_PATHS_COPY_ONE_BITS (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2) + +#define SKC_CMD_PATHS_COPY_ONE_MASK SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS) + +#define SKC_CMD_PATHS_COPY_ONE (1u << SKC_CMD_PATHS_COPY_ONE_BITS) + +#define SKC_CMD_PATHS_COPY_GET_TAG(ti) SKC_TAGGED_BLOCK_ID_GET_TAG(ti) + +#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti) ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS) + +#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b) (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG)) + +// +// +// + +skc_uint +skc_sub_group_local_id() +{ +#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1 + return get_sub_group_local_id(); +#else + return 0; +#endif +} + +// +// convert an atomic read counter offset to a block id +// + +skc_block_id_t +skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids, + skc_uint const bp_idx_mask, + skc_uint const bp_reads, + skc_uint const bp_off) +{ + skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask; + + return bp_ids[bp_idx]; +} + +// +// +// + +void +skc_copy_segs(__global skc_paths_copy_elem * const bp_elems, // to + skc_uint const bp_elems_idx, + __global skc_paths_copy_elem const * const pb_elems, // from + skc_uint const pb_elems_idx) +{ + for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) + { + (bp_elems+bp_elems_idx)[ii] = (pb_elems+pb_elems_idx)[ii]; + } + +#if 0 + // + // NOTE THIS IS PRINTING 8 ROWS + // + printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n", + (skc_uint)get_global_id(0),pb_elems_idx, + as_float((pb_elems+pb_elems_idx)[0*SKC_PATHS_COPY_SUBGROUP_SIZE]), + as_float((pb_elems+pb_elems_idx)[1*SKC_PATHS_COPY_SUBGROUP_SIZE]), + as_float((pb_elems+pb_elems_idx)[2*SKC_PATHS_COPY_SUBGROUP_SIZE]), + as_float((pb_elems+pb_elems_idx)[3*SKC_PATHS_COPY_SUBGROUP_SIZE])); + printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n", + (skc_uint)get_global_id(0),pb_elems_idx, + as_float((pb_elems+pb_elems_idx)[4*SKC_PATHS_COPY_SUBGROUP_SIZE]), + as_float((pb_elems+pb_elems_idx)[5*SKC_PATHS_COPY_SUBGROUP_SIZE]), + as_float((pb_elems+pb_elems_idx)[6*SKC_PATHS_COPY_SUBGROUP_SIZE]), + as_float((pb_elems+pb_elems_idx)[7*SKC_PATHS_COPY_SUBGROUP_SIZE])); +#endif +} + +// +// +// + +void +skc_copy_node(__global skc_paths_copy_elem * const bp_elems, // to + skc_uint const bp_elems_idx, + __global skc_block_id_t const * const bp_ids, + skc_uint const bp_reads, + skc_uint const bp_idx_mask, + __global skc_paths_copy_elem const * const pb_elems, // from + skc_uint const pb_elems_idx, + skc_uint const pb_rolling) +{ + // + // remap block id tags bp_elems the host-side rolling counter pb_elems a + // device-side block pool id + // + for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) + { + // load block_id_tag words + skc_paths_copy_elem elem = (pb_elems + pb_elems_idx)[ii]; + + // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid + skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask; + + // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS + + // + // FIXME -- SIMD can be fully parallelized since a bp_ids[] load + // will _always_ be safe as long as we don't use the loaded + // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead + // of iterating over the vector components. + // + + // only convert if original elem is not invalid + +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \ + skc_block_id_t const b = bp_ids[bp_idx C]; \ + elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \ + } + + // printf("%2u: < %8X, %8X, %8X >\n",ii,bp_idx,b,elem C); + + SKC_PATHS_COPY_ELEM_EXPAND(); + + // store the elem back + (bp_elems+bp_elems_idx)[ii] = elem; + } +} + +// +// +// + +void +skc_host_map_update(__global skc_uint * const host_map, + skc_uint const block, + skc_paths_copy_elem const elem) +{ + // + // write first elem to map -- FIXME -- this is a little nasty + // because it relies on the the host handle always being the first + // word in the path header. + // + // OTOH, this is not unreasonable. The alternative is to have a + // separate kernel initializing the map. + // +#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1 + if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE) +#endif + { +#if SKC_PATHS_COPY_ELEM_WORDS == 1 + host_map[elem] = block; +#if 0 + printf("[%u] = %u\n",elem,block); +#endif +#else + host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block; +#endif + } +} + +// +// +// + +void +skc_copy_head(__global skc_uint * const host_map, + skc_uint const block, + __global skc_paths_copy_elem * const bp_elems, // to + skc_uint const bp_elems_idx, + __global skc_block_id_t const * const bp_ids, + skc_uint const bp_reads, + skc_uint const bp_idx_mask, + __global skc_paths_copy_elem const * const pb_elems, // from + skc_uint const pb_elems_idx, + skc_uint const pb_rolling) +{ + // + // if there are more path header words than there are + // threads-per-block then we can just copy the initial header words + // +#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 ) + for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) + { + skc_paths_copy_elem const elem = (pb_elems+pb_elems_idx)[ii]; + + (bp_elems+bp_elems_idx)[ii] = elem; + + if (ii == 0) { + skc_host_map_update(host_map,block,elem); + } + } +#endif + + // + // this is similar to copy node but the first H words of the path + // header are not modified and simply copied + // + for (skc_uint ii=SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii<SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) + { + skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii]; + +#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER == 0 ) + if (ii == 0) { + skc_host_map_update(host_map,block,elem); + } +#endif + // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid + skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask; + + // + // FIXME -- SIMD can be fully parallelized since a bp_ids[] load + // will _always_ be safe as long as we don't use the loaded + // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead + // of iterating over the vector components. + // + + // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS + + // FIXME -- MIX MIX MIX MIX / SELECT + + // only convert if original elem is not invalid +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (SKC_IS_NOT_PATH_HEAD(ii,I) && (elem C != SKC_TAGGED_BLOCK_ID_INVALID)) { \ + skc_block_id_t const b = bp_ids[bp_idx C]; \ + elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \ + } + + // printf("%2u: ( %8X, %8X, %8X )\n",ii,bp_idx,b,elem C); + + SKC_PATHS_COPY_ELEM_EXPAND(); + + // store the elem back + (bp_elems+bp_elems_idx)[ii] = elem; + } + + // + // the remaining words are treated like a node + // + for (skc_uint ii=SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE) + { + // load block_id_tag words + skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii]; + + // calculate ahead of time + skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask; + + // + // FIXME -- SIMD can be fully parallelized since a bp_ids[] load + // will _always_ be safe as long as we don't use the loaded + // value! So... fix UPDATE_ROLLING to be SIMD-friendly instead + // of iterating over the vector components. + // + + // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS + + // only convert if original elem is not invalid +#undef SKC_EXPAND_X +#define SKC_EXPAND_X(I,S,C,P,R) \ + if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) { \ + skc_block_id_t const b = bp_ids[bp_idx C]; \ + elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b); \ + } + + // printf("%2u: [ %8X, %8X, %8X ]\n",ii,bp_idx,b,elem C); + + SKC_PATHS_COPY_ELEM_EXPAND(); + + // store the elem + (bp_elems+bp_elems_idx)[ii] = elem; + } +} + +// +// FIXME -- pack some of these constant integer args in a vec or struct +// + +__kernel +SKC_PATHS_COPY_KERNEL_ATTRIBS +void +skc_kernel_paths_copy +(__global skc_uint * const host_map, + + __global skc_block_id_t const * const bp_ids, + __global skc_paths_copy_elem * const bp_elems, + skc_uint const bp_idx_mask, // pow2 modulo mask for block pool ring + + __global skc_uint const * const bp_alloc, // block pool ring base + skc_uint const bp_alloc_idx,// which subbuf + + __global union skc_tagged_block_id const * const pb_cmds, + __global skc_paths_copy_elem const * const pb_elems, + + skc_uint const pb_size, // # of commands/blocks in buffer + skc_uint const pb_rolling, // shifted rolling counter base + + skc_uint const pb_prev_from, + skc_uint const pb_prev_span, + skc_uint const pb_curr_from) +{ + // + // THERE ARE 3 TYPES OF PATH COPYING COMMANDS: + // + // - HEAD + // - NODE + // - SEGS + // + // THESE ARE SUBGROUP ORIENTED KERNELS + // + // A SUBGROUP CAN OPERATE ON [1,N] BLOCKS + // + + // + // It's likely that peak bandwidth is achievable with a single + // workgroup. + // + // So let's keep the grids modestly sized and for simplicity and + // portability, let's assume that a single workgroup can perform all + // steps in the copy. + // + // Launch as large of a workgroup as possiblex + // + // 1. ATOMICALLY ALLOCATE BLOCKS BP_ELEMS POOL + // 2. CONVERT COMMANDS IN PB_ELEMS BLOCK OFFSETS + // 3. FOR EACH COMMAND: + // - HEAD: SAVED HEAD ID PB_ELEMS MAP. CONVERT AND COPY H INDICES. + // - NODE: CONVERT AND COPY B INDICES + // - SEGS: BULK COPY + // + // B : number of words in block -- always pow2 + // W : intelligently/arbitrarily chosen factor of B -- always pow2 + // + + // + // There are several approaches to processing the commands: + // + // 1. B threads are responsible for one block. All threads broadcast + // load a single command word. Workgroup size must be a facpb_elemsr of + // B. + // + // 2. W threads process an entire block. W will typically be the + // device's subgroup/warp/wave width. W threads broadcast load a + // single command word. + // + // 3. W threads process W blocks. W threads load W command words and + // process W blocks. + // + // Clearly (1) has low I/O intensity but will achieve high + // parallelism by activating the most possible threads. The downside + // of this kind of approach is that the kernel will occupy even a + // large GPU with low intensity work and reduce opportunities for + // concurrent kernel execution (of other kernels). + // + // See Vasily Volkov's CUDA presentation describing these tradeoffs. + // + // Note that there are many other approaches. For example, similar + // pb_elems (1) but each thread loads a pow2 vector of block data. + // + + // load the copied atomic read "base" from gmem + skc_uint const bp_reads = bp_alloc[bp_alloc_idx]; + // will always be less than 2^32 + skc_uint const gid = get_global_id(0); + // every subgroup/simd that will work on the block loads the same command + skc_uint const sg_idx = gid / SKC_PATHS_COPY_SUBGROUP_SIZE; + // path builder data can be spread across two spans + skc_uint pb_idx = sg_idx + ((sg_idx < pb_prev_span) ? pb_prev_from : pb_curr_from); + + // no need pb_elems make this branchless + if (pb_idx >= pb_size) + pb_idx -= pb_size; + + // broadcast load the command + union skc_tagged_block_id const pb_cmd = pb_cmds[pb_idx]; + + // what do we want pb_elems do with this block? + skc_cmd_paths_copy_tag const tag = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32); + + // compute offset from rolling base to get index into block pool ring allocation + skc_uint const bp_off = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling); + + // convert the pb_cmd's offset counter pb_elems a block id + skc_block_id_t const block = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off); + +#if 0 + if (get_sub_group_local_id() == 0) { + printf("bp_off/reads = %u / %u\n",bp_off,bp_reads); + printf("< %8u >\n",block); + } +#endif + + // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id() + skc_uint const tid = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK; + + // calculate bp_elems (to) / pb_elems (from) + skc_uint const bp_elems_idx = block * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid; + skc_uint const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK + tid; + + if (tag == SKC_CMD_PATHS_COPY_TAG_SEGS) + { +#if 0 + if (tid == 0) + printf("%3u, segs\n",bp_off); +#endif + skc_copy_segs(bp_elems, + bp_elems_idx, + pb_elems, + pb_elems_idx); + } + else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE) + { +#if 0 + if (tid == 0) + printf("%3u, NODE\n",bp_off); +#endif + skc_copy_node(bp_elems, // to + bp_elems_idx, + bp_ids, + bp_reads, + bp_idx_mask, + pb_elems, // from + pb_elems_idx, + pb_rolling); + } + else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD) + { +#if 0 + if (tid == 0) + printf("%3u, HEAD\n",bp_off); +#endif + skc_copy_head(host_map, + block, + bp_elems, // to + bp_elems_idx, + bp_ids, + bp_reads, + bp_idx_mask, + pb_elems, // from + pb_elems_idx, + pb_rolling); + } +} + +// +// +// + +__kernel +SKC_PATHS_ALLOC_KERNEL_ATTRIBS +void +skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics, + __global skc_uint * const bp_alloc, + skc_uint const bp_alloc_idx, + skc_uint const pb_cmd_count) +{ + // + // allocate blocks in block pool + // + skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count); + + // store in slot + bp_alloc[bp_alloc_idx] = reads; + +#if 0 + printf("pc: %8u + %u\n",reads,pb_cmd_count); +#endif +} + +// +// +// |