src/compute/skc/platforms/cl_12/kernels/block_pool_init.cl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64

/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

#include "kernel_cl_12.h"

//
// BEST TO RUN THESE KERNELS ON AN OUT-OF-ORDER COMMAND QUEUE (CQ)
//

//
// Fill the block pool id array with an arithmetic sequence: element i
// receives i * SKC_DEVICE_SUBBLOCKS_PER_BLOCK.
//
//   ids     - global array written by this kernel; each work-item
//             writes at most the one element at its own global id
//   bp_size - number of elements to initialize; work-items whose
//             global id is at or beyond this value write nothing
//
__kernel
SKC_BP_INIT_IDS_KERNEL_ATTRIBS
void
skc_kernel_block_pool_init_ids(__global uint * const ids, uint const bp_size)
{
  uint const idx = get_global_id(0);

  //
  // FIXME -- TUNE FOR ARCH -- evaluate if it's much faster to
  // accomplish this with fewer threads and using either IPC and/or
  // vector stores -- it should be on certain architectures!
  //

  //
  // work-items past the end of the pool have nothing to do
  //
  if (idx >= bp_size)
    return;

  //
  // initialize pool with sequence
  //
  ids[idx] = idx * SKC_DEVICE_SUBBLOCKS_PER_BLOCK;
}

//
//
//

//
// Initialize the block pool atomics: each work-item stores
// (its lane index * bp_size) at bp_atomics[lane index], so lane 0
// writes 0 and lane 1 writes bp_size.
//
//   bp_atomics - global array written by this kernel
//   bp_size    - value stored by lane 1
//
// There is no bounds check here: the kernel is intended to be launched
// with two threads. NOTE(review): confirm the launch configuration
// against the host code and SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS.
//
__kernel
SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS
void
skc_kernel_block_pool_init_atomics(__global uint * const bp_atomics, uint const bp_size)
{
  //
  // Use the subgroup lane id when a subgroup extension is advertised,
  // otherwise fall back to the work-group local id.
  //
  // NOTE(review): the previous comment described a "version test" that
  // squelches a bug with the Intel OpenCL CPU compiler declaring it
  // supports the cl_intel_subgroups extension, but no version test is
  // present in this condition -- confirm whether one is still needed.
  //
#if defined(cl_intel_subgroups) || defined (cl_khr_subgroups)
  uint const lane = get_sub_group_local_id();
#else
  uint const lane = get_local_id(0);
#endif

  //
  // two threads store [ 0, bp_size ]
  //
  uint const value = lane * bp_size;

  bp_atomics[lane] = value;
}

//
//
//