// path: root/src/compute/skc/platforms/cl_12/kernels/fills_expand.cl
// blob: 39fee75f3dcf940876413bb5f3d371a13c0121fe
/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

#include "block.h"
#include "path.h"
#include "common.h"
#include "atomic_cl.h"
#include "raster_builder_cl_12.h"
#include "device_cl_12.h"

//
// Subgroup and block geometry used by the fills-expand kernel.
//
// SKC_FILLS_EXPAND_SUBGROUP_SIZE, SKC_FILLS_EXPAND_ELEM_WORDS and the
// SKC_DEVICE_*_WORDS constants are not defined in this file.
//

// Subgroup size minus one.  Only usable as a lane mask if the
// subgroup size is a power of two.  Not referenced by the code
// visible in this file.
#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_MASK (SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)

// Number of elements in a block and in a subblock.  The kernel
// multiplies a block id by ELEMS_PER_SUBBLOCK to obtain the index of
// the block's first element.
#define SKC_FILLS_EXPAND_ELEMS_PER_BLOCK    (SKC_DEVICE_BLOCK_WORDS    / SKC_FILLS_EXPAND_ELEM_WORDS)
#define SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)

// Elements of one block handled by each subgroup lane.  Not
// referenced by the code visible in this file.
#define SKC_FILLS_EXPAND_ELEMS_PER_THREAD   (SKC_FILLS_EXPAND_ELEMS_PER_BLOCK / SKC_FILLS_EXPAND_SUBGROUP_SIZE)

//
// Words of one block handled by each subgroup lane.  This value picks
// the SKC_EXPAND_n() expansion used to unroll per-lane loads.
//
// NOTE(review): this matches SKC_FILLS_EXPAND_ELEMS_PER_THREAD only
// when SKC_FILLS_EXPAND_ELEM_WORDS is 1 -- confirm.
//

#define SKC_FILLS_EXPAND_X  (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_SUBGROUP_SIZE)

//
// Select the expansion matching SKC_FILLS_EXPAND_X.
//
// SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() forwards to one of the
// SKC_EXPAND_n() macros, which are defined outside this file.  The
// kernel redefines SKC_EXPAND_X(I,S,C,P,R) before every use, so each
// SKC_EXPAND_n() presumably invokes SKC_EXPAND_X once per row
// I = 0..n-1 -- confirm against its definition.
//
// SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST is X-1, the index of the
// last row.  It is token-pasted onto a register name prefix to reach
// the row holding the final words of a block.
//
// Only X in { 1, 2, 4, 8, 16 } is supported; anything else is a
// compile-time error.
//

#if   ( SKC_FILLS_EXPAND_X == 1 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()       SKC_EXPAND_1()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST  0

#elif ( SKC_FILLS_EXPAND_X == 2 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()       SKC_EXPAND_2()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST  1

#elif ( SKC_FILLS_EXPAND_X == 4 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()       SKC_EXPAND_4()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST  3

#elif ( SKC_FILLS_EXPAND_X == 8 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()       SKC_EXPAND_8()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST  7

#elif ( SKC_FILLS_EXPAND_X == 16)
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()       SKC_EXPAND_16()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST  15

#else
#error "MISSING SKC_FILLS_EXPAND_X"
#endif

//
// Fill and rasterize cmds only differ in their first word semantics.
//
// Overlaying the two lets the kernel load a command through .fill
// and store the same bits through .rasterize after the .nodeword
// field has been overwritten.
//

union skc_cmd_expand
{
  union skc_cmd_fill      fill;      // view used when loading from cmds_in
  union skc_cmd_rasterize rasterize; // view used when storing to cmds_out
};

//
// One 32-bit word of a path block, viewable as either an unsigned
// integer or a float.  The code in this file only reads the .u32 view
// (tagged block ids and header counts).
//

union skc_path_elem
{
  skc_uint  u32;
  skc_float f32;
};

//
// COMPILE-TIME AND RUN-TIME MACROS
//
// Row I of an expansion covers block word offsets
//
//   [ I * SUBGROUP_SIZE, (I+1) * SUBGROUP_SIZE )
//
// SKC_ELEM_IN_RANGE(X,I) is true when offset X lies inside row I.
// SKC_ELEM_GTE(X,I)      is true when offset X lies beyond row I,
//                        i.e. X >= (I+1) * SUBGROUP_SIZE.
//
// Both expansions are fully parenthesized so they can be negated or
// combined with other operators, and I is parenthesized wherever it
// takes part in arithmetic.
//

#define SKC_ELEM_IN_RANGE(X,I)                                             \
  ((skc_bool)SKC_GTE_MACRO(X,((I)  ) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) &&  \
   (skc_bool)SKC_LT_MACRO(X,((I)+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE))

#define SKC_ELEM_GTE(X,I)                                       \
  (SKC_GTE_MACRO(X,((I)+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE))

//
// FIXME -- slate these for replacement
//
// SKC_BROADCAST(E,S,I) broadcasts the .u32 word of register E##I from
// the lane holding block word offset S.  S must lie inside row I, so
// the lane is S - I * SUBGROUP_SIZE.  S and I are parenthesized in
// the arithmetic so that expression arguments keep their meaning.
//
// SKC_BROADCAST_LAST(E,I) broadcasts the .u32 word of register E##I
// from the last lane of the subgroup.  The extra HELPER level exists
// because arguments adjacent to ## are not macro-expanded: going
// through SKC_BROADCAST_LAST first lets I be a macro (such as
// SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST) that is replaced by its
// value before it is pasted onto E.
//

#define SKC_BROADCAST(E,S,I)                                            \
  sub_group_broadcast(E##I.u32,(S) - (I) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)

#define SKC_BROADCAST_LAST_HELPER(E,I)                                  \
  sub_group_broadcast(E##I.u32,SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)

#define SKC_BROADCAST_LAST(E,I)                 \
  SKC_BROADCAST_LAST_HELPER(E,I)

//
// Append one rasterize command per lane whose element `e` carries a
// tag below SKC_BLOCK_ID_TAG_PATH_NEXT.
//
//   cmds_out : destination command extent
//   out_idx  : in/out base index; advanced by the number of commands
//              the whole subgroup appended.  The kernel in this file
//              passes a base that is one less than the first free
//              slot, which is what makes the inclusive scan below
//              yield the right slot.
//   cmd      : caller's command; its .rasterize.nodeword is
//              overwritten with e_idx on every lane
//   e        : this lane's path element
//   e_idx    : index of `e` in the blocks array
//
// Calls subgroup collectives, so every lane of the subgroup has to
// reach this function together.
//

void
skc_cmds_out_append(__global union skc_cmd_rasterize * const cmds_out,
                    skc_uint                         * const out_idx,
                    union skc_cmd_expand             * const cmd,
                    union skc_path_elem                const e,
                    skc_uint                           const e_idx)
{
  //
  // FIXME -- we can append a large number of nodeword indices to a
  // local SMEM queue and flush when full.  It may or may not be a
  // performance win on some architectures.
  //

  // does this lane contribute a command?
  skc_bool const keep  = SKC_TAGGED_BLOCK_ID_GET_TAG(e.u32) < SKC_BLOCK_ID_TAG_PATH_NEXT;

  // 1-based rank of this lane among the contributing lanes
  skc_uint const rank  = sub_group_scan_inclusive_add(keep ? 1 : 0);

  // the last lane's rank is the subgroup-wide number of commands
  skc_uint const total = sub_group_broadcast(rank,SKC_FILLS_EXPAND_SUBGROUP_SIZE-1);

  // tag the command with the element it refers to
  cmd->rasterize.nodeword = e_idx;

  if (keep)
    {
      skc_uint const slot = *out_idx + rank;

      cmds_out[slot] = cmd->rasterize;
    }

  // step the base past everything the subgroup just wrote
  *out_idx += total;
}

//
// skc_kernel_fills_expand
//
// Each subgroup loads one fill command, looks up the head block of
// the command's path and walks the head block followed by
// `count_nodes` node blocks.  For every block word accepted by
// skc_cmds_out_append() a copy of the command, with .nodeword set to
// that word's index in `blocks`, is written to `cmds_out`.
//
//   blocks   : path elements, indexed by
//              id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + word
//   atomics  : counters; the one at SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS
//              is bumped by the path's prim count to reserve output slots
//   map      : indexed by cmd.fill.path, yields the head block id
//   cmds_in  : fill commands, one per subgroup
//   cmds_out : expanded rasterize commands
//
// NOTE(review): cmd_idx is not bounds-checked here -- presumably the
// host launches exactly one subgroup per command; confirm.
//

__kernel
SKC_FILLS_EXPAND_KERNEL_ATTRIBS
void
skc_kernel_fills_expand(__global union skc_path_elem     const    * const blocks,
                        __global skc_uint                volatile * const atomics,
                        __global skc_block_id_t          const    * const map,
                        __global union skc_cmd_fill      const    * const cmds_in,
                        __global union skc_cmd_rasterize          * const cmds_out)
{
  //
  // Need to harmonize the way we determine a subgroup's id.  In this
  // kernel it's not as important because no local memory is being
  // used.  Although the device/mask calc to determine subgroup and
  // lanes is still proper, we might want to make it clearer that
  // we're working with subgroups by using the subgroup API.
  //
  // every subgroup/simd that will work on the block loads the same command
  //
  // cmd_stride is the number of subgroups per workgroup, so cmd_idx
  // is the subgroup's index across the whole launch.
  //
#if (__OPENCL_VERSION__ < 200)
  skc_uint const       cmd_stride = get_num_sub_groups();
#else
  skc_uint const       cmd_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
#endif
  skc_uint             cmd_idx    = get_group_id(0) * cmd_stride + get_sub_group_id();

  // load fill command -- we reuse y component
  union skc_cmd_expand cmd        = { .fill = cmds_in[cmd_idx] };

  // get the path header block from the map
  skc_block_id_t       id         = map[cmd.fill.path];

#if 0
  if (get_sub_group_local_id() == 0)
    printf("expand[%u] = %u\n",cmd_idx,id);
#endif

  //
  // blindly load all of the head elements into registers
  //
  // lane L of the subgroup ends up holding block words
  // L, L + SUBGROUP_SIZE, ... in registers h0, h1, ...
  //
  skc_uint head_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  union skc_path_elem h##I = blocks[head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];

  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

  //
  // pick out count.nodes and count.prims from the header
  //
  // The range tests below compare constants, so for each count only
  // the row that contains the header offset performs the broadcast.
  // Both variables are left unassigned if an offset falls outside
  // every row.
  //
  skc_uint count_nodes, count_prims;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) {                \
    count_nodes  = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_NODES,I);       \
  }                                                                     \
  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_PRIMS,I)) {                \
    count_prims  = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_PRIMS,I);       \
  }

  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

  //
  // debug of path head
  //
#if 0
  skc_uint count_blocks;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) {               \
    count_blocks = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_BLOCKS,I);      \
  }

  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

  if (get_sub_group_local_id() == 0)
    printf("path header = { %5u, %5u, %5u }\n",
           count_blocks,count_nodes,count_prims);
#endif

  //
  // acquire slots in the expanded cmd extent
  //
  // decrement out_idx by 1 so we can use inclusive warp scan later
  //
  // only lane 0 touches the counter; the result is then shared with
  // the rest of the subgroup
  //
  skc_uint out_idx = 0;

  if (get_sub_group_local_id() == 0) {
    out_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP
      (atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS,count_prims) - 1;
  }

  out_idx = sub_group_broadcast(out_idx,0);

  //
  // process ids trailing the path header
  //
  // rows that lie entirely before SKC_PATH_HEAD_OFFSET_IDS are
  // skipped.  In the row that contains the offset, lanes holding
  // words before it have their word replaced with
  // SKC_TAGGED_BLOCK_ID_INVALID -- presumably so the append does not
  // treat header words as ids.
  //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  if (!SKC_ELEM_GTE(SKC_PATH_HEAD_OFFSET_IDS,I)) {                      \
    if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_IDS,I)) {                \
      if (get_sub_group_local_id() + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE < SKC_PATH_HEAD_OFFSET_IDS) { \
        h##I.u32 = SKC_TAGGED_BLOCK_ID_INVALID;                         \
      }                                                                 \
    }                                                                   \
    skc_cmds_out_append(cmds_out,&out_idx,&cmd,h##I,                    \
                        head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); \
  }

  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

  //
  // we're done if it was just the header
  //
  if (count_nodes == 0)
    return;

  //
  // otherwise, process the nodes
  //

  //
  // get id of next node
  //
  // it is read from the last lane of the last row, i.e. the final
  // word of the block
  //
  id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(h,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));

  //
  // the following blocks are nodes
  //
  // the loop exits through the return once count_nodes reaches zero
  //
  while (true)
    {
      // get index of each element
      skc_uint node_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();

      //
      // blindly load all of the node elements into registers
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      union skc_path_elem const n##I = blocks[node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];

      SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

      //
      // append all valid ids
      //
      // unlike the head block, every row of a node is passed to the
      // append
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      skc_cmds_out_append(cmds_out,&out_idx,&cmd,n##I,                  \
                          node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE);

      SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

      // any more nodes?
      if (--count_nodes == 0)
        return;

      //
      // get id of next node
      //
      // as for the head block, it is taken from the final word of
      // the node just processed
      //
      id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(n,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
    }
}

//
//
//