path: root/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl
/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

#include "path.h"
#include "block_pool_cl.h"
#include "path_builder_cl_12.h"
#include "kernel_cl_12.h"

//
//
//

#if 0

//
// SIMD AVX2
//

#define SKC_PATHS_COPY_WORDS_PER_ELEM          8
#define SKC_PATHS_COPY_SUBGROUP_SIZE           1
#define SKC_PATHS_COPY_KERNEL_ATTRIBS

typedef skc_uint8  skc_paths_copy_elem;
typedef skc_uint8  skc_pb_idx_v;

#define SKC_PATHS_COPY_ELEM_EXPAND()           SKC_EXPAND_8()

#define SKC_IS_NOT_PATH_HEAD(sg,I)             ((sg) + I >= SKC_PATH_HEAD_WORDS)

#endif

//
//
//

#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK      (SKC_PATHS_COPY_SUBGROUP_SIZE - 1)
#define SKC_PATHS_COPY_ELEMS_PER_BLOCK         (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK      (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
#define SKC_PATHS_COPY_ELEMS_PER_THREAD        (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE)

// FIXME -- use SUBGROUP terminology everywhere
#define SKC_PATHS_COPY_SUBGROUP_WORDS          (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS)

//
//
//

#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER                              \
  (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS))

#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER                           \
  (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS))
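
//
// For example (hypothetical values -- the real constants come from the
// headers included above): with SKC_PATH_HEAD_WORDS == 7,
// SKC_PATHS_COPY_ELEM_WORDS == 1 and SKC_PATHS_COPY_SUBGROUP_SIZE == 8,
// SKC_PATHS_COPY_SUBGROUP_WORDS is 8 and:
//
//   SKC_PATHS_COPY_ELEMS_BEFORE_HEADER    = 8 * ((7 / 1) / 8)     = 0
//   SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER = 8 * ((7 + 8 - 1) / 8) = 8
//
// i.e. no subgroup-wide stride lies entirely inside the header and a
// single stride covers it.
//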

// #define SKC_PATHS_COPY_HEAD_ELEMS    ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS)

//
//
//

//
// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL
//

#define SKC_CMD_PATHS_COPY_ONE_BITS              (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2)

#define SKC_CMD_PATHS_COPY_ONE_MASK              SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS)

#define SKC_CMD_PATHS_COPY_ONE                   (1u << SKC_CMD_PATHS_COPY_ONE_BITS)

#define SKC_CMD_PATHS_COPY_GET_TAG(ti)           SKC_TAGGED_BLOCK_ID_GET_TAG(ti)

#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti)       ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS)

#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b)  (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG))
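
//
// Worked example with hypothetical widths (the real values come from
// the headers included above): if SKC_TAGGED_BLOCK_ID_BITS_TAG were 3
// and SKC_DEVICE_SUBBLOCK_WORDS_LOG2 were 4, then:
//
//   SKC_CMD_PATHS_COPY_ONE_BITS = 7
//   SKC_CMD_PATHS_COPY_ONE_MASK = 0x7F
//   SKC_CMD_PATHS_COPY_ONE      = 0x80
//
// so SKC_CMD_PATHS_COPY_GET_ROLLING(0x0380) = 0x0380 >> 7 = 7 -- the
// rolling counter occupies the bits above the tag+subblock field --
// and SKC_CMD_PATHS_COPY_UPDATE_ROLLING() keeps that low field while
// splicing a block id in above the tag bits.
//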

//
//
//

skc_uint
skc_sub_group_local_id()
{
#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
  return get_sub_group_local_id();
#else
  return 0;
#endif
}

//
// convert an atomic read counter offset to a block id
//

skc_block_id_t
skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids,
                 skc_uint                        const bp_idx_mask,
                 skc_uint                        const bp_reads,
                 skc_uint                        const bp_off)
{
  skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask;

  return bp_ids[bp_idx];
}
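
//
// For example, with a block pool id ring of 4096 entries
// (bp_idx_mask == 0xFFF), bp_reads == 4090 and bp_off == 10 wrap to
// bp_idx == (4090 + 10) & 0xFFF == 4, so the id is read from
// bp_ids[4].  (Illustrative numbers only.)
//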

//
//
//

void
skc_copy_segs(__global skc_paths_copy_elem       * const bp_elems, // to
              skc_uint                             const bp_elems_idx,
              __global skc_paths_copy_elem const * const pb_elems, // from
              skc_uint                             const pb_elems_idx)
{
  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      (bp_elems+bp_elems_idx)[ii] = (pb_elems+pb_elems_idx)[ii];
    }

#if 0
  //
  // NOTE THIS IS PRINTING 8 ROWS
  //
  printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
         (skc_uint)get_global_id(0),pb_elems_idx,
         as_float((pb_elems+pb_elems_idx)[0*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[1*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[2*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[3*SKC_PATHS_COPY_SUBGROUP_SIZE]));
  printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
         (skc_uint)get_global_id(0),pb_elems_idx,
         as_float((pb_elems+pb_elems_idx)[4*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[5*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[6*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[7*SKC_PATHS_COPY_SUBGROUP_SIZE]));
#endif
}

//
//
//

void
skc_copy_node(__global skc_paths_copy_elem       * const bp_elems, // to
              skc_uint                             const bp_elems_idx,
              __global skc_block_id_t      const * const bp_ids,
              skc_uint                             const bp_reads,
              skc_uint                             const bp_idx_mask,
              __global skc_paths_copy_elem const * const pb_elems, // from
              skc_uint                             const pb_elems_idx,
              skc_uint                             const pb_rolling)
{
  //
  // remap each block id tag from the host-side rolling counter to a
  // device-side block pool id
  //
  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      // load block_id_tag words
      skc_paths_copy_elem elem   = (pb_elems + pb_elems_idx)[ii];

      // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
      skc_pb_idx_v  const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;

      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS

      //
      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
      // will _always_ be safe as long as we don't use the loaded
      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
      // of iterating over the vector components.
      //

      // only convert if original elem is not invalid

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                 \
      if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) {              \
        skc_block_id_t const b = bp_ids[bp_idx C];              \
        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);   \
      }

      // printf("%2u: < %8X, %8X, %8X >\n",ii,bp_idx,b,elem C);
      
      SKC_PATHS_COPY_ELEM_EXPAND();

      // store the elem back
      (bp_elems+bp_elems_idx)[ii] = elem;
    }
}

//
//
//

void
skc_host_map_update(__global skc_uint * const host_map,
                    skc_uint            const block,
                    skc_paths_copy_elem const elem)
{
  //
  // write first elem to map -- FIXME -- this is a little nasty
  // because it relies on the host handle always being the first
  // word in the path header.
  //
  // OTOH, this is not unreasonable.  The alternative is to have a
  // separate kernel initializing the map.
  //
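  // In effect this records host_map[host path handle] = id of the
  // path's head block in the block pool.
  //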
#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
  if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE)
#endif
    {
#if SKC_PATHS_COPY_ELEM_WORDS == 1
      host_map[elem] = block; 
#if 0
      printf("[%u] = %u\n",elem,block);
#endif
#else
      host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block;
#endif
    }
}

//
//
//

void
skc_copy_head(__global skc_uint                  * const host_map,
              skc_uint                             const block,
              __global skc_paths_copy_elem       * const bp_elems, // to
              skc_uint                             const bp_elems_idx,
              __global skc_block_id_t      const * const bp_ids,
              skc_uint                             const bp_reads,
              skc_uint                             const bp_idx_mask,
              __global skc_paths_copy_elem const * const pb_elems, // from
              skc_uint                             const pb_elems_idx,
              skc_uint                             const pb_rolling)
{
  //
  // if there are more path header words than there are
  // threads-per-block then we can just copy the initial header words
  //
#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 )
  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      skc_paths_copy_elem const elem = (pb_elems+pb_elems_idx)[ii];

      (bp_elems+bp_elems_idx)[ii] = elem;

      if (ii == 0) {
        skc_host_map_update(host_map,block,elem);
      }
    }
#endif

  //
  // this is similar to copy node but the first H words of the path
  // header are not modified and simply copied
  //
  for (skc_uint ii=SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii<SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];

#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER == 0 )
      if (ii == 0) {
        skc_host_map_update(host_map,block,elem);
      }
#endif
      // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
      skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;

      //
      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
      // will _always_ be safe as long as we don't use the loaded
      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
      // of iterating over the vector components.
      //

      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS

      // FIXME -- MIX MIX MIX MIX / SELECT

      // only convert if original elem is not invalid
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      if (SKC_IS_NOT_PATH_HEAD(ii,I) && (elem C != SKC_TAGGED_BLOCK_ID_INVALID)) { \
        skc_block_id_t const b = bp_ids[bp_idx C];                      \
        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);           \
      }

      // printf("%2u: ( %8X, %8X, %8X )\n",ii,bp_idx,b,elem C);

      SKC_PATHS_COPY_ELEM_EXPAND();

      // store the elem back
      (bp_elems+bp_elems_idx)[ii] = elem;
    }

  //
  // the remaining words are treated like a node
  //
  for (skc_uint ii=SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      // load block_id_tag words
      skc_paths_copy_elem elem   = (pb_elems+pb_elems_idx)[ii];

      // calculate ahead of time
      skc_pb_idx_v  const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;

      //
      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
      // will _always_ be safe as long as we don't use the loaded
      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
      // of iterating over the vector components.
      //

      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS

      // only convert if original elem is not invalid
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                 \
      if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) {              \
        skc_block_id_t const b = bp_ids[bp_idx C];              \
        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);   \
      }

      // printf("%2u: [ %8X, %8X, %8X ]\n",ii,bp_idx,b,elem C);

      SKC_PATHS_COPY_ELEM_EXPAND();

      // store the elem
      (bp_elems+bp_elems_idx)[ii] = elem;
    }
}

//
// FIXME -- pack some of these constant integer args in a vec or struct
//

__kernel
SKC_PATHS_COPY_KERNEL_ATTRIBS
void
skc_kernel_paths_copy
(__global skc_uint                        * const host_map,

 __global skc_block_id_t            const * const bp_ids,
 __global skc_paths_copy_elem             * const bp_elems,
 skc_uint                                   const bp_idx_mask, // pow2 modulo mask for block pool ring

 __global skc_uint                  const * const bp_alloc,    // block pool ring base
 skc_uint                                   const bp_alloc_idx,// which subbuf

 __global union skc_tagged_block_id const * const pb_cmds,
 __global skc_paths_copy_elem       const * const pb_elems,

 skc_uint                                   const pb_size,     // # of commands/blocks in buffer
 skc_uint                                   const pb_rolling,  // shifted rolling counter base

 skc_uint                                   const pb_prev_from,
 skc_uint                                   const pb_prev_span,
 skc_uint                                   const pb_curr_from)
{
  //
  // THERE ARE 3 TYPES OF PATH COPYING COMMANDS:
  //
  // - HEAD
  // - NODE
  // - SEGS
  //
  // THESE ARE SUBGROUP ORIENTED KERNELS
  //
  // A SUBGROUP CAN OPERATE ON [1,N] BLOCKS
  //

  //
  // It's likely that peak bandwidth is achievable with a single
  // workgroup.
  //
  // So let's keep the grid modestly sized and, for simplicity and
  // portability, assume that a single workgroup can perform all
  // steps in the copy.
  //
  // Launch as large a workgroup as possible.
  //
  // 1. ATOMICALLY ALLOCATE BLOCKS FROM THE POOL
  // 2. CONVERT COMMANDS INTO BLOCK OFFSETS
  // 3. FOR EACH COMMAND:
  //      - HEAD: SAVE HEAD ID TO MAP. CONVERT AND COPY H INDICES.
  //      - NODE: CONVERT AND COPY B INDICES
  //      - SEGS: BULK COPY
  //
  // B : number of words in block -- always pow2
  // W : intelligently/arbitrarily chosen factor of B -- always pow2
  //

  //
  // There are several approaches to processing the commands:
  //
  // 1. B threads are responsible for one block. All threads broadcast
  //    load a single command word. Workgroup size must be a factor of
  //    B.
  //
  // 2. W threads process an entire block. W will typically be the
  //    device's subgroup/warp/wave width. W threads broadcast load a
  //    single command word.
  //
  // 3. W threads process W blocks. W threads load W command words and
  //    process W blocks.
  //
  // Clearly (1) has low I/O intensity but will achieve high
  // parallelism by activating as many threads as possible. The downside
  // of this kind of approach is that the kernel will occupy even a
  // large GPU with low intensity work and reduce opportunities for
  // concurrent kernel execution (of other kernels).
  //
  // See Vasily Volkov's CUDA presentation describing these tradeoffs.
  //
  // Note that there are many other approaches.  For example, similar
  // to (1) but each thread loads a pow2 vector of block data.
  //
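  // As written below, each subgroup of SKC_PATHS_COPY_SUBGROUP_SIZE
  // threads broadcast-loads one command and processes one block, which
  // corresponds to approach (2); in the SIMD configuration (subgroup
  // size 1) a single work-item instead walks a whole block with
  // vector-wide loads, along the lines of the variant noted above.
  //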

  // load the copied atomic read "base" from gmem
  skc_uint const bp_reads = bp_alloc[bp_alloc_idx];
  // will always be less than 2^32
  skc_uint const gid      = get_global_id(0);
  // every subgroup/simd that will work on the block loads the same command
  skc_uint const sg_idx   = gid / SKC_PATHS_COPY_SUBGROUP_SIZE;
  // path builder data can be spread across two spans
  skc_uint       pb_idx   = sg_idx + ((sg_idx < pb_prev_span) ? pb_prev_from : pb_curr_from);

  // no need to make this branchless
  if (pb_idx >= pb_size)
    pb_idx -= pb_size;

  // broadcast load the command
  union skc_tagged_block_id const pb_cmd       = pb_cmds[pb_idx];

  // what do we want to do with this block?
  skc_cmd_paths_copy_tag    const tag          = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32);

  // compute offset from rolling base to get index into block pool ring allocation
  skc_uint                  const bp_off       = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling);

  // convert the pb_cmd's offset counter to a block id
  skc_block_id_t            const block        = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off);

#if 0
  if (get_sub_group_local_id() == 0) {
    printf("bp_off/reads = %u / %u\n",bp_off,bp_reads);
    printf("< %8u >\n",block);
  }
#endif

  // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id()
  skc_uint                 const tid          = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK;

  // calculate bp_elems (to) / pb_elems (from)
  skc_uint                 const bp_elems_idx = block  * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid;
  skc_uint                 const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK    + tid;

  if      (tag == SKC_CMD_PATHS_COPY_TAG_SEGS)
    {
#if 0
      if (tid == 0)
        printf("%3u, segs\n",bp_off);
#endif
      skc_copy_segs(bp_elems,
                    bp_elems_idx,
                    pb_elems,
                    pb_elems_idx);
    }
  else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE)
    {
#if 0
      if (tid == 0)
        printf("%3u, NODE\n",bp_off);
#endif
      skc_copy_node(bp_elems, // to
                    bp_elems_idx,
                    bp_ids,
                    bp_reads,
                    bp_idx_mask,
                    pb_elems, // from
                    pb_elems_idx,
                    pb_rolling);
    }
  else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD)
    {
#if 0
      if (tid == 0)
        printf("%3u, HEAD\n",bp_off);
#endif
      skc_copy_head(host_map,
                    block,
                    bp_elems, // to
                    bp_elems_idx,
                    bp_ids,
                    bp_reads,
                    bp_idx_mask,
                    pb_elems, // from
                    pb_elems_idx,
                    pb_rolling);
    }
}

//
//
//

__kernel
SKC_PATHS_ALLOC_KERNEL_ATTRIBS
void
skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics,
                       __global skc_uint          * const bp_alloc,
                       skc_uint                     const bp_alloc_idx,
                       skc_uint                     const pb_cmd_count)
{
  //
  // allocate blocks in block pool
  //
  skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count);

  // store in slot
  bp_alloc[bp_alloc_idx] = reads;

#if 0
  printf("pc: %8u + %u\n",reads,pb_cmd_count);
#endif
}
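
//
// NOTE: skc_kernel_paths_alloc records the atomic "reads" base in
// bp_alloc[bp_alloc_idx] and skc_kernel_paths_copy loads it from the
// same slot, so the host presumably enqueues (or otherwise orders) the
// alloc kernel ahead of the copy kernel for a given subbuffer.
//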

//
//
//